Skip to content

Commit

Permalink
Retry platform-level errors in the isolated process for .NET isolated (
Browse files Browse the repository at this point in the history
…#2922)

Co-authored-by: Andy Staples <[email protected]>
  • Loading branch information
davidmrdavid and andystaples authored Oct 3, 2024
1 parent 1163944 commit 8a5db71
Show file tree
Hide file tree
Showing 7 changed files with 447 additions and 7 deletions.
78 changes: 75 additions & 3 deletions .github/workflows/smoketest-dotnet-isolated-v4.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,79 @@ jobs:
steps:
- uses: actions/checkout@v2

# Validation is blocked on https://github.com/Azure/azure-functions-host/issues/7995
- name: Run V4 .NET Isolated Smoke Test
run: test/SmokeTests/e2e-test.ps1 -DockerfilePath test/SmokeTests/OOProcSmokeTests/DotNetIsolated/Dockerfile -HttpStartPath api/StartHelloCitiesTyped -NoValidation
# Install .NET versions
- name: Set up .NET Core 3.1
uses: actions/setup-dotnet@v3
with:
dotnet-version: '3.1.x'

- name: Set up .NET Core 2.1
uses: actions/setup-dotnet@v3
with:
dotnet-version: '2.1.x'

- name: Set up .NET Core 6.x
uses: actions/setup-dotnet@v3
with:
dotnet-version: '6.x'

- name: Set up .NET Core 8.x
uses: actions/setup-dotnet@v3
with:
dotnet-version: '8.x'

# Install Azurite
- name: Set up Node.js (needed for Azurite)
uses: actions/setup-node@v3
with:
node-version: '18.x' # Azurite requires at least Node 18

- name: Install Azurite
run: npm install -g azurite

- name: Restore WebJobs extension
run: dotnet restore $solution

- name: Build and pack WebJobs extension
run: cd ./src/WebJobs.Extensions.DurableTask &&
mkdir ./out &&
dotnet build -c Release WebJobs.Extensions.DurableTask.csproj --output ./out &&
mkdir ~/packages &&
dotnet nuget push ./out/Microsoft.Azure.WebJobs.Extensions.DurableTask.*.nupkg --source ~/packages &&
dotnet nuget add source ~/packages

- name: Build .NET Isolated Smoke Test
run: cd ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated &&
dotnet restore --verbosity normal &&
dotnet build -c Release

- name: Install core tools
run: npm i -g azure-functions-core-tools@4 --unsafe-perm true

# Run smoke tests
# Unlike other smoke tests, the .NET isolated smoke tests run outside of a docker container, but to race conditions
# when building the smoke test app in docker, causing the build to fail. This is a temporary workaround until the
# root cause is identified and fixed.

- name: Run smoke tests (Hello Cities)
shell: pwsh
run: azurite --silent --blobPort 10000 --queuePort 10001 --tablePort 10002 &
cd ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated && func host start --port 7071 &
./test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 -HttpStartPath api/StartHelloCitiesTyped

- name: Run smoke tests (Process Exit)
shell: pwsh
run: azurite --silent --blobPort 10000 --queuePort 10001 --tablePort 10002 &
./test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 -HttpStartPath api/durable_HttpStartProcessExitOrchestrator

- name: Run smoke tests (Timeout)
shell: pwsh
run: azurite --silent --blobPort 10000 --queuePort 10001 --tablePort 10002 &
cd ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated && func host start --port 7071 &
./test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 -HttpStartPath api/durable_HttpStartTimeoutOrchestrator

- name: Run smoke tests (OOM)
shell: pwsh
run: azurite --silent --blobPort 10000 --queuePort 10001 --tablePort 10002 &
cd ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated && func host start --port 7071 &
./test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 -HttpStartPath api/durable_HttpStartOOMOrchestrator
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,31 @@ internal void SetResult(string orchestratorResponseJsonText)
this.SetResultInternal(result);
}

private void ThrowIfPlatformLevelException(FailureDetails failureDetails)
{
// Recursively inspect the FailureDetails of the failed orchestrator and throw if a platform-level exception is detected.
//
// Today, this method only checks for <see cref="OutOfMemoryException"/>. In the future, we may want to add more cases.
// Other known platform-level exceptions, like timeouts or process exists due to `Environment.FailFast`, do not yield
// a `OrchestratorExecutionResult` as the isolated invocation is abruptly terminated. Therefore, they don't need to be
// handled in this method.
// However, our tests reveal that OOMs are, surprisngly, caught and returned as a `OrchestratorExecutionResult`
// by the isolated process, and thus need special handling.
//
// It's unclear if all OOMs are caught by the isolated process (probably not), and also if there are other platform-level
// errors that are also caught in the isolated process and returned as a `OrchestratorExecutionResult`. Let's add them
// to this method as we encounter them.
if (failureDetails.InnerFailure?.IsCausedBy<OutOfMemoryException>() ?? false)
{
throw new SessionAbortedException(failureDetails.ErrorMessage);
}

if (failureDetails.InnerFailure != null)
{
this.ThrowIfPlatformLevelException(failureDetails.InnerFailure);
}
}

private void SetResultInternal(OrchestratorExecutionResult result)
{
// Look for an orchestration completion action to see if we need to grab the output.
Expand All @@ -133,6 +158,14 @@ private void SetResultInternal(OrchestratorExecutionResult result)

if (completeAction.OrchestrationStatus == OrchestrationStatus.Failed)
{
// If the orchestrator failed due to a platform-level error in the isolated process,
// we should re-throw that exception in the host (this process) invocation pipeline,
// so the invocation can be retried.
if (completeAction.FailureDetails != null)
{
this.ThrowIfPlatformLevelException(completeAction.FailureDetails);
}

string message = completeAction switch
{
{ FailureDetails: { } f } => f.ErrorMessage,
Expand Down
23 changes: 20 additions & 3 deletions src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs
Original file line number Diff line number Diff line change
Expand Up @@ -138,10 +138,15 @@ await this.LifeCycleNotificationHelper.OrchestratorStartingAsync(

byte[] triggerReturnValueBytes = Convert.FromBase64String(triggerReturnValue);
P.OrchestratorResponse response = P.OrchestratorResponse.Parser.ParseFrom(triggerReturnValueBytes);

// TrySetResult may throw if a platform-level error is encountered (like an out of memory exception).
context.SetResult(
response.Actions.Select(ProtobufUtils.ToOrchestratorAction),
response.CustomStatus);

// Here we throw if the orchestrator completed with an application-level error. When we do this,
// the function's result type will be of type `OrchestrationFailureException` which is reserved
// for application-level errors that do not need to be re-tried.
context.ThrowIfFailed();
},
#pragma warning restore CS0618 // Type or member is obsolete (not intended for general public use)
Expand All @@ -159,6 +164,19 @@ await this.LifeCycleNotificationHelper.OrchestratorStartingAsync(
// Re-throw so we can abort this invocation.
this.HostLifetimeService.OnStopping.ThrowIfCancellationRequested();
}

// we abort the invocation on "platform level errors" such as:
// - a timeout
// - an out of memory exception
// - a worker process exit
if (functionResult.Exception is Host.FunctionTimeoutException
|| functionResult.Exception?.InnerException is SessionAbortedException // see RemoteOrchestrationContext.TrySetResultInternal for details on OOM-handling
|| (functionResult.Exception?.InnerException?.GetType().ToString().Contains("WorkerProcessExitException") ?? false))
{
// TODO: the `WorkerProcessExitException` type is not exposed in our dependencies, it's part of WebJobs.Host.Script.
// Should we add that dependency or should it be exposed in WebJobs.Host?
throw functionResult.Exception;
}
}
catch (Exception hostRuntimeException)
{
Expand Down Expand Up @@ -238,8 +256,7 @@ await this.LifeCycleNotificationHelper.OrchestratorFailedAsync(
else
{
// the function failed for some other reason

string exceptionDetails = functionResult.Exception.ToString();
string exceptionDetails = functionResult.Exception?.ToString() ?? "Framework-internal message: exception details could not be extracted";

this.TraceHelper.FunctionFailed(
this.Options.HubName,
Expand All @@ -258,7 +275,7 @@ await this.LifeCycleNotificationHelper.OrchestratorFailedAsync(

orchestratorResult = OrchestratorExecutionResult.ForFailure(
message: $"Function '{functionName}' failed with an unhandled exception.",
functionResult.Exception);
functionResult.Exception ?? new Exception($"Function '{functionName}' failed with an unknown unhandled exception"));
}

// Send the result of the orchestrator function to the DTFx dispatch pipeline.
Expand Down
25 changes: 25 additions & 0 deletions test/SmokeTests/OOProcSmokeTests/DotNetIsolated/DotNetIsolated.sln
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 17
VisualStudioVersion = 17.5.002.0
MinimumVisualStudioVersion = 10.0.40219.1
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "DotNetIsolated", "DotNetIsolated.csproj", "{B2DBA49D-9D25-46DB-8968-15D5E83B4060}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{B2DBA49D-9D25-46DB-8968-15D5E83B4060}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{B2DBA49D-9D25-46DB-8968-15D5E83B4060}.Debug|Any CPU.Build.0 = Debug|Any CPU
{B2DBA49D-9D25-46DB-8968-15D5E83B4060}.Release|Any CPU.ActiveCfg = Release|Any CPU
{B2DBA49D-9D25-46DB-8968-15D5E83B4060}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {0954D7B4-582F-4F85-AE3E-5D503FB07DB1}
EndGlobalSection
EndGlobal
165 changes: 165 additions & 0 deletions test/SmokeTests/OOProcSmokeTests/DotNetIsolated/FaultyOrchestrators.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
using Microsoft.Azure.Functions.Worker;
using Microsoft.Azure.Functions.Worker.Http;
using Microsoft.DurableTask;
using Microsoft.DurableTask.Client;
using Microsoft.Extensions.Logging;
using System;

namespace FaultOrchestrators
{
public static class FaultyOrchestrators
{
[Function(nameof(OOMOrchestrator))]
public static Task OOMOrchestrator(
[OrchestrationTrigger] TaskOrchestrationContext context)
{
// this orchestrator is not deterministic, on purpose.
// we use the non-determinism to force an OOM exception on only the first replay

// check if a file named "replayEvidence" exists in source code directory, create it if it does not.
// From experience, this code runs in `<sourceCodePath>/bin/output/`, so we store the file two directories above.
// We do this because the /bin/output/ directory gets overridden during the build process, which happens automatically
// when `func host start` is re-invoked.
string evidenceFile = System.IO.Path.Combine(System.IO.Directory.GetCurrentDirectory(), "..", "..", "replayEvidence");
bool isTheFirstReplay = !System.IO.File.Exists(evidenceFile);
if (isTheFirstReplay)
{
System.IO.File.Create(evidenceFile).Close();

// force the process to run out of memory
List<byte[]> data = new List<byte[]>();

for (int i = 0; i < 10000000; i++)
{
data.Add(new byte[1024 * 1024 * 1024]);
}

// we expect the code to never reach this statement, it should OOM.
// we throw just in case the code does not time out. This should fail the test
throw new Exception("this should never be reached");
}
else {
// if it's not the first replay, delete the evidence file and return
System.IO.File.Delete(evidenceFile);
return Task.CompletedTask;
}
}

[Function(nameof(ProcessExitOrchestrator))]
public static Task ProcessExitOrchestrator(
[OrchestrationTrigger] TaskOrchestrationContext context)
{
// this orchestrator is not deterministic, on purpose.
// we use the non-determinism to force a sudden process exit on only the first replay

// check if a file named "replayEvidence" exists in source code directory, create it if it does not.
// From experience, this code runs in `<sourceCodePath>/bin/output/`, so we store the file two directories above.
// We do this because the /bin/output/ directory gets overridden during the build process, which happens automatically
// when `func host start` is re-invoked.
string evidenceFile = System.IO.Path.Combine(System.IO.Directory.GetCurrentDirectory(), "..", "..", "replayEvidence");
bool isTheFirstReplay = !System.IO.File.Exists(evidenceFile);
if (isTheFirstReplay)
{
System.IO.File.Create(evidenceFile).Close();

// force sudden crash
Environment.FailFast("Simulating crash!");
throw new Exception("this should never be reached");
}
else {
// if it's not the first replay, delete the evidence file and return
System.IO.File.Delete(evidenceFile);
return Task.CompletedTask;
}
}

[Function(nameof(TimeoutOrchestrator))]
public static Task TimeoutOrchestrator(
[OrchestrationTrigger] TaskOrchestrationContext context)
{
// this orchestrator is not deterministic, on purpose.
// we use the non-determinism to force a timeout on only the first replay

// check if a file named "replayEvidence" exists in source code directory, create it if it does not.
// From experience, this code runs in `<sourceCodePath>/bin/output/`, so we store the file two directories above.
// We do this because the /bin/output/ directory gets overridden during the build process, which happens automatically
// when `func host start` is re-invoked.
string evidenceFile = System.IO.Path.Combine(System.IO.Directory.GetCurrentDirectory(), "..", "..", "replayEvidence");
bool isTheFirstReplay = !System.IO.File.Exists(evidenceFile);

if (isTheFirstReplay)
{
System.IO.File.Create(evidenceFile).Close();

// force the process to timeout after a 1 minute wait
System.Threading.Thread.Sleep(TimeSpan.FromMinutes(1));

// we expect the code to never reach this statement, it should time out.
// we throw just in case the code does not time out. This should fail the test
throw new Exception("this should never be reached");
}
else {
// if it's not the first replay, delete the evidence file and return
System.IO.File.Delete(evidenceFile);
return Task.CompletedTask;
}
}

[Function("durable_HttpStartOOMOrchestrator")]
public static async Task<HttpResponseData> HttpStartOOMOrchestrator(
[HttpTrigger(AuthorizationLevel.Anonymous, "get", "post")] HttpRequestData req,
[DurableClient] DurableTaskClient client,
FunctionContext executionContext)
{
ILogger logger = executionContext.GetLogger("durable_HttpStartOOMOrchestrator");

// Function input comes from the request content.
string instanceId = await client.ScheduleNewOrchestrationInstanceAsync(
nameof(OOMOrchestrator));

logger.LogInformation("Started orchestration with ID = '{instanceId}'.", instanceId);

// Returns an HTTP 202 response with an instance management payload.
// See https://learn.microsoft.com/azure/azure-functions/durable/durable-functions-http-api#start-orchestration
return await client.CreateCheckStatusResponseAsync(req, instanceId);
}

[Function("durable_HttpStartProcessExitOrchestrator")]
public static async Task<HttpResponseData> HttpStartProcessExitOrchestrator(
[HttpTrigger(AuthorizationLevel.Anonymous, "get", "post")] HttpRequestData req,
[DurableClient] DurableTaskClient client,
FunctionContext executionContext)
{
ILogger logger = executionContext.GetLogger("durable_HttpStartProcessExitOrchestrator");

// Function input comes from the request content.
string instanceId = await client.ScheduleNewOrchestrationInstanceAsync(
nameof(ProcessExitOrchestrator));

logger.LogInformation("Started orchestration with ID = '{instanceId}'.", instanceId);

// Returns an HTTP 202 response with an instance management payload.
// See https://learn.microsoft.com/azure/azure-functions/durable/durable-functions-http-api#start-orchestration
return await client.CreateCheckStatusResponseAsync(req, instanceId);
}

[Function("durable_HttpStartTimeoutOrchestrator")]
public static async Task<HttpResponseData> HttpStartTimeoutOrchestrator(
[HttpTrigger(AuthorizationLevel.Anonymous, "get", "post")] HttpRequestData req,
[DurableClient] DurableTaskClient client,
FunctionContext executionContext)
{
ILogger logger = executionContext.GetLogger("durable_HttpStartTimeoutOrchestrator");

// Function input comes from the request content.
string instanceId = await client.ScheduleNewOrchestrationInstanceAsync(
nameof(TimeoutOrchestrator));

logger.LogInformation("Started orchestration with ID = '{instanceId}'.", instanceId);

// Returns an HTTP 202 response with an instance management payload.
// See https://learn.microsoft.com/azure/azure-functions/durable/durable-functions-http-api#start-orchestration
return await client.CreateCheckStatusResponseAsync(req, instanceId);
}
}
}
11 changes: 10 additions & 1 deletion test/SmokeTests/OOProcSmokeTests/DotNetIsolated/host.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,14 @@
"excludedTypes": "Request"
}
}
}
},
"extensions": {
"durableTask": {
"storageProvider": {
"maxQueuePollingInterval": "00:00:01",
"controlQueueVisibilityTimeout": "00:01:00"
}
}
},
"functionTimeout": "00:00:30"
}
Loading

0 comments on commit 8a5db71

Please sign in to comment.