diff --git a/.github/workflows/smoketest-dotnet-isolated-v4.yml b/.github/workflows/smoketest-dotnet-isolated-v4.yml index 37773e62c..474f48448 100644 --- a/.github/workflows/smoketest-dotnet-isolated-v4.yml +++ b/.github/workflows/smoketest-dotnet-isolated-v4.yml @@ -69,31 +69,29 @@ jobs: run: npm i -g azure-functions-core-tools@4 --unsafe-perm true # Run smoke tests - # - name: Run smoke tests (Hello Cities) - # shell: pwsh - # run: azurite --silent --blobPort 10000 --queuePort 10001 --tablePort 10002 & - # cd ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated && func host start --port 7071 & - # ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 -HttpStartPath api/StartHelloCitiesTyped - - # - name: Run smoke tests (Timeout) - # shell: pwsh - # run: azurite --silent --blobPort 10000 --queuePort 10001 --tablePort 10002 & - # cd ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated && func host start --port 7071 & - # ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 -HttpStartPath api/durable_HttpStartTimeoutOrchestrator - - # - name: Run smoke tests (OOM) - # shell: pwsh - # run: azurite --silent --blobPort 10000 --queuePort 10001 --tablePort 10002 & - # cd ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated && func host start --port 7071 & - # ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 -HttpStartPath api/durable_HttpStartOOMOrchestrator + # Unlike other smoke tests, the .NET isolated smoke tests run outside of a docker container due to race conditions + # when building the smoke test app in docker, which cause the build to fail. This is a temporary workaround until the + # root cause is identified and fixed.
+ + - name: Run smoke tests (Hello Cities) + shell: pwsh + run: azurite --silent --blobPort 10000 --queuePort 10001 --tablePort 10002 & + cd ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated && func host start --port 7071 & + ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 -HttpStartPath api/StartHelloCitiesTyped - name: Run smoke tests (Process Exit) shell: pwsh run: azurite --silent --blobPort 10000 --queuePort 10001 --tablePort 10002 & ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 -HttpStartPath api/durable_HttpStartProcessExitOrchestrator + - name: Run smoke tests (Timeout) + shell: pwsh + run: azurite --silent --blobPort 10000 --queuePort 10001 --tablePort 10002 & + cd ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated && func host start --port 7071 & + ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 -HttpStartPath api/durable_HttpStartTimeoutOrchestrator - # # Validation is blocked on https://github.com/Azure/azure-functions-host/issues/7995 - # - name: Run V4 .NET Isolated Smoke Test - # run: test/SmokeTests/e2e-test.ps1 -DockerfilePath test/SmokeTests/OOProcSmokeTests/DotNetIsolated/Dockerfile -HttpStartPath api/StartHelloCitiesTyped - # shell: pwsh \ No newline at end of file + - name: Run smoke tests (OOM) + shell: pwsh + run: azurite --silent --blobPort 10000 --queuePort 10001 --tablePort 10002 & + cd ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated && func host start --port 7071 & + ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 -HttpStartPath api/durable_HttpStartOOMOrchestrator \ No newline at end of file diff --git a/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs b/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs index 3ebecbc3d..d50e5657f 100644 --- a/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs +++ b/src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs @@ -171,7 +171,7 @@ await 
this.LifeCycleNotificationHelper.OrchestratorStartingAsync( // - a worker process exit if (functionResult.Exception is Host.FunctionTimeoutException || functionResult.Exception?.InnerException is OutOfMemoryException // see RemoteOrchestrationContext.TrySetResultInternal for details on OOM-handling - || (functionResult.Exception?.InnerException?.GetType().ToString().Contains("WorkerProcessExitExceptionn") ?? false)) + || (functionResult.Exception?.InnerException?.GetType().ToString().Contains("WorkerProcessExitException") ?? false)) { // TODO: the `WorkerProcessExitException` type is not exposed in our dependencies, it's part of WebJobs.Host.Script. // Should we add that dependency or should it be exposed in WebJobs.Host? diff --git a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/FaultyOrchestrators.cs b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/FaultyOrchestrators.cs index dd7327f94..90006fd20 100644 --- a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/FaultyOrchestrators.cs +++ b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/FaultyOrchestrators.cs @@ -16,18 +16,16 @@ public static Task OOMOrchestrator( // this orchestrator is not deterministic, on purpose. // we use the non-determinism to force an OOM exception on only the first replay - // check if a file named "replayEvidence" exists in the current directory. - // create it if it does not - string evidenceFile = "replayEvidence"; + // check if a file named "replayEvidence" exists in source code directory, create it if it does not. + // From experience, this code runs in `/bin/output/`, so we store the file two directories above. + // We do this because the /bin/output/ directory gets overridden during the build process, which happens automatically + // when `func host start` is re-invoked. 
+ string evidenceFile = System.IO.Path.Combine(System.IO.Directory.GetCurrentDirectory(), "..", "..", "replayEvidence"); bool isTheFirstReplay = !System.IO.File.Exists(evidenceFile); if (isTheFirstReplay) { System.IO.File.Create(evidenceFile).Close(); - } - - // on the very first replay, OOM the process - if (isTheFirstReplay) - { + // force the process to run out of memory List data = new List(); @@ -49,20 +47,18 @@ public static Task ProcessExitOrchestrator( // this orchestrator is not deterministic, on purpose. // we use the non-determinism to force a sudden process exit on only the first replay - // check if a file named "replayEvidence" exists in the current directory. - // create it if it does not - string evidenceFile = "replayEvidence"; + // check if a file named "replayEvidence" exists in source code directory, create it if it does not. + // From experience, this code runs in `/bin/output/`, so we store the file two directories above. + // We do this because the /bin/output/ directory gets overridden during the build process, which happens automatically + // when `func host start` is re-invoked. + string evidenceFile = System.IO.Path.Combine(System.IO.Directory.GetCurrentDirectory(), "..", "..", "replayEvidence"); bool isTheFirstReplay = !System.IO.File.Exists(evidenceFile); if (isTheFirstReplay) { System.IO.File.Create(evidenceFile).Close(); - } - - // on the very first replay, OOM the process - if (isTheFirstReplay) - { - // simulate out of memory - throw new OutOfMemoryException(); + + // simulate sudden crash + Environment.FailFast("Simulating crash!"); } // assuming the orchestrator survived the OOM, delete the evidence file and return @@ -78,18 +74,17 @@ public static Task TimeoutOrchestrator( // this orchestrator is not deterministic, on purpose. // we use the non-determinism to force a timeout on only the first replay - // check if a file named "replayEvidence" exists in the current directory. 
- // create it if it does not - string evidenceFile = "replayEvidence"; + // check if a file named "replayEvidence" exists in source code directory, create it if it does not. + // From experience, this code runs in `/bin/output/`, so we store the file two directories above. + // We do this because the /bin/output/ directory gets overridden during the build process, which happens automatically + // when `func host start` is re-invoked. + string evidenceFile = System.IO.Path.Combine(System.IO.Directory.GetCurrentDirectory(), "..", "..", "replayEvidence"); bool isTheFirstReplay = !System.IO.File.Exists(evidenceFile); + if (isTheFirstReplay) { System.IO.File.Create(evidenceFile).Close(); - } - - // on the very first replay, time out the execution - if (isTheFirstReplay) - { + // force the process to timeout after a 1 minute wait System.Threading.Thread.Sleep(TimeSpan.FromMinutes(1)); } diff --git a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/host.json b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/host.json index 68e9f689b..ae6c359f7 100644 --- a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/host.json +++ b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/host.json @@ -10,8 +10,10 @@ }, "extensions": { "durableTask": { + "hubName": "hubbbb1113324", "storageProvider": { - "maxQueuePollingInterval": "00:00:01" + "maxQueuePollingInterval": "00:00:01", + "controlQueueVisibilityTimeout": "00:01:00" } } }, diff --git a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 index a90dda4f4..839cdea5d 100644 --- a/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 +++ b/test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 @@ -1,6 +1,11 @@ -# This is a simple smoke test runner to validate the .NET isolated smoke tests. 
-# It supercedes the usual e2e-tests.ps1 script for the .NET isolated scenario because building the tests on the docker image -# is unreliable. For more details, see: https://github.com/Azure/azure-functions-host/issues/7995 +# This is a simple test runner to validate the .NET isolated smoke tests. +# It supersedes the usual e2e-test.ps1 script for the .NET isolated scenario because building the smoke test app +# on the docker image is unreliable. For more details, see: https://github.com/Azure/azure-functions-host/issues/7995 + +# This script is designed specifically to test cases where the isolated worker process experiences a platform failure: +# timeouts, OOMs, etc. For that reason, it is careful to check that the Functions Host is running and healthy at regular +# intervals. This makes these tests run more slowly than other test categories. + param( [Parameter(Mandatory=$true)] [string]$HttpStartPath @@ -9,19 +14,20 @@ param( $retryCount = 0; $statusUrl = $null; $success = $false; +$haveManuallyRestartedHost = $false; Do { $testIsRunning = $true; # Start the functions host if it's not running already. - # Then give it 40 seconds to start up. This is a long wait, but from experience the CI can be slow to start up the host. + # Then give it up to 1 minute to start up. + # This is a long wait, but from experience the CI can be slow to start up the host, especially after a platform-error. $isFunctionsHostRunning = (Get-Process -Name func -ErrorAction SilentlyContinue) if ($isFunctionsHostRunning -eq $null) { Write-Host "Starting the Functions host..." -ForegroundColor Yellow # The '&' operator is used to run the command in the background - cd ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated && func host start --port 7071 >> Debug.txt & - + cd ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated && func host start --port 7071 & Write-Host "Waiting for the Functions host to start up..."
-ForegroundColor Yellow Start-Sleep -Seconds 60 } @@ -34,7 +40,7 @@ Do { Invoke-RestMethod -Method Post -Uri "http://localhost:7071/admin/host/ping" Write-Host "Host is healthy!" -ForegroundColor Green - # Note that any HTTP protocol errors (e.g. HTTP 4xx or 5xx) will cause an immediate failure + # Start orchestrator if it hasn't been started yet if ($statusUrl -eq $null){ $startOrchestrationUri = "http://localhost:7071/$HttpStartPath" Write-Host "Starting a new orchestration instance via POST to $startOrchestrationUri..." -ForegroundColor Yellow @@ -44,10 +50,15 @@ Do { Write-Host "Waiting for orchestration to complete..." -ForegroundColor Yellow $statusUrl = $result.statusQueryGetUri + + # sleep for a bit to give the orchestrator a chance to start, + # then loop once more in case the orchestrator ran quickly, made the host unhealthy, + # and the functions host needs to be restarted + Start-Sleep -Seconds 5 continue; } - + # Check the orchestrator status $result = Invoke-RestMethod -Method Get -Uri $statusUrl $runtimeStatus = $result.runtimeStatus Write-Host "Orchestration is $runtimeStatus" -ForegroundColor Yellow @@ -58,29 +69,47 @@ Do { $testIsRunning = $false break } + if ($result.runtimeStatus -eq "Failed") { + $success = $false + $testIsRunning = $false + break + } + # If the orchestrator did not complete yet, wait for a bit before checking again Start-Sleep -Seconds 2 $retryCount = $retryCount + 1 } catch { Write-Host "An error occurred:" -ForegroundColor Red Write-Host $_ -ForegroundColor Red - $output = cat ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated/Debug.txt - Write-Host $output - Write-Host "======================================" -ForegroundColor Red + + # When testing for platform errors, we want to make sure the Functions host is healthy and ready to take requests. + # The Host can get into bad states (for example, in an OOM-inducing test) where it does not self-heal. 
+ # For these cases, we manually restart the host to ensure it is in a good state. We only do this once per test. + if ($haveManuallyRestartedHost -eq $false) { + + # We stop the host process and wait for a bit before checking if it is running again. + Write-Host "Restarting the Functions host..." -ForegroundColor Yellow + Stop-Process -Name "func" -Force + Start-Sleep -Seconds 5 + + # Log whether the process kill succeeded + $haveManuallyRestartedHost = $true + $isFunctionsHostRunning = ((Get-Process -Name func -ErrorAction SilentlyContinue) -eq $null) + Write-Host "Host process killed: $isFunctionsHostRunning" -ForegroundColor Yellow + + # the beginning of the loop will restart the host + continue + } # Rethrow the original exception throw } - - # This delay a bit excessive, but we want to make sure the Functions runtime is up and running before we start the orchestration - # This number was determined, through trial and error, to be a safe amount of time to wait - Start-Sleep -Seconds 40 } while (($testIsRunning -eq $true) -and ($retryCount -lt 65)) if ($success -eq $false) { - throw "Orchestration didn't complete in time! :(" + throw "Orchestration failed or did not compete in time! :(" } Write-Host "Success!" -ForegroundColor Green \ No newline at end of file