Minimize telemetry surface area #2844

Merged on Jul 9, 2024: 33 commits from dajusto/remove-potentially-sensitive-logs into dev.
The file changes shown below are from 1 commit of the 33.

Commits
94a1f2c  first draft (davidmrdavid, Jun 4, 2024)
a56cf6d  remove unecessary diff (davidmrdavid, Jun 10, 2024)
ffdd81c  refactor (davidmrdavid, Jun 10, 2024)
17dceeb  refactor (davidmrdavid, Jun 10, 2024)
ba345d1  Refactor (davidmrdavid, Jun 11, 2024)
77ebaa6  refactor (davidmrdavid, Jun 11, 2024)
e1e865c  remove unecessary diff (davidmrdavid, Jun 11, 2024)
6bd3a33  add nullable checks (davidmrdavid, Jun 11, 2024)
95bcd87  add nullable checks in E2ETraceHelper (davidmrdavid, Jun 11, 2024)
dbd62d2  remove whitespace (davidmrdavid, Jun 11, 2024)
1ea40e3  add nullability checks (davidmrdavid, Jun 11, 2024)
a43292c  clean up (davidmrdavid, Jun 11, 2024)
e4fab7a  refactorings (davidmrdavid, Jun 11, 2024)
39f400e  refactorings (davidmrdavid, Jun 11, 2024)
80f6a09  add comments in csproj (davidmrdavid, Jun 11, 2024)
d3103fe  quick test (davidmrdavid, Jun 13, 2024)
1fe4e67  remove nullable assignment (davidmrdavid, Jun 13, 2024)
caf746e  small edit (davidmrdavid, Jun 13, 2024)
10ac5a9  minor refactor (davidmrdavid, Jun 13, 2024)
900b1a2  remove line (davidmrdavid, Jun 13, 2024)
032f566  remove line (davidmrdavid, Jun 13, 2024)
f31cad9  null string handling (davidmrdavid, Jun 13, 2024)
43f63fc  apply feedback (davidmrdavid, Jun 17, 2024)
f30f622  Merge branch 'dev' into dajusto/remove-potentially-sensitive-logs (davidmrdavid, Jun 24, 2024)
b1ec95d  remove replay controls (davidmrdavid, Jul 3, 2024)
de2e41c  Merge branch 'dajusto/remove-potentially-sensitive-logs' of https://g… (davidmrdavid, Jul 3, 2024)
73fed6e  add unit tests (davidmrdavid, Jul 3, 2024)
e16b912  Add unit tests (davidmrdavid, Jul 3, 2024)
8ed5d30  clean up tests (davidmrdavid, Jul 3, 2024)
5d15568  pass tests (davidmrdavid, Jul 8, 2024)
778cecd  increase timeout (davidmrdavid, Jul 8, 2024)
bac3b33  pass linter (davidmrdavid, Jul 9, 2024)
2af6b47  Merge branch 'dev' of https://github.com/Azure/azure-functions-durabl… (davidmrdavid, Jul 9, 2024)
@@ -122,13 +122,12 @@ bool IDurableEntityContext.HasState
     public void CaptureInternalError(Exception e, TaskEntityShim shim)
     {
         // first, try to get a quick ETW message out to help us diagnose what happened
-        string details = Utils.IsFatal(e) ? e.GetType().Name : e.ToString();
         this.Config.TraceHelper.EntityBatchFailed(
             this.HubName,
             this.Name,
             this.InstanceId,
             shim.TraceFlags,
-            details);
+            e);

         // then, record the error for additional reporting and tracking in other places
         this.InternalError = ExceptionDispatchInfo.Capture(e);
@@ -180,22 +179,27 @@ public void ThrowApplicationExceptionsIfAny()
         }
     }

-    public bool ErrorsPresent(out string description)
+    public bool ErrorsPresent(out string error, out string sanitizedError)
     {
         if (this.InternalError != null)
         {
-            description = $"Internal error: {this.InternalError.SourceException}";
+            error = $"Internal error: {this.InternalError.SourceException}";
+            sanitizedError = $"Internal error: {this.InternalError.SourceException.GetType().FullName} \n {this.InternalError.SourceException.StackTrace}";
             return true;
         }
         else if (this.ApplicationErrors != null)
         {
             var messages = this.ApplicationErrors.Select(i => $"({i.SourceException.Message})");
-            description = $"One or more operations failed: {string.Concat(messages)}";
+            error = $"One or more operations failed: {string.Concat(messages)}";
+
+            string errorTypes = string.Join(", ", this.ApplicationErrors.Select(i => i.SourceException.GetType().FullName));
+            sanitizedError = $"One or more operations failed: {errorTypes}";
             return true;
         }
         else
         {
-            description = string.Empty;
+            error = string.Empty;
+            sanitizedError = string.Empty;
             return false;
         }
     }
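To make the two out parameters concrete, here is a small, self-contained sketch of how the raw and sanitized strings differ; the exception type and its message below are hypothetical stand-ins, not values taken from this PR.

using System;

class SanitizationSketch
{
    static void Main()
    {
        try
        {
            // The message stands in for content that may contain user data.
            throw new InvalidOperationException("customer id 12345 not found");
        }
        catch (Exception e)
        {
            // Raw form: includes the exception message, so it can leak user content.
            string error = $"Internal error: {e}";

            // Sanitized form: type name and stack trace only, mirroring the pattern above.
            string sanitizedError = $"Internal error: {e.GetType().FullName} \n {e.StackTrace}";

            Console.WriteLine(error);
            Console.WriteLine(sanitizedError);
        }
    }
}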
@@ -782,6 +782,7 @@ internal async Task<TResult> CallDurableTaskFunctionAsync<TResult>(
         operationName,
         input: "(replayed)",
         exception: "(replayed)",
+        sanitizedException: string.Empty,
         duration: 0,
         isReplay: true);
     }
@@ -792,6 +793,7 @@ internal async Task<TResult> CallDurableTaskFunctionAsync<TResult>(
         functionName,
         this.InstanceId,
         reason: $"(replayed {exception.GetType().Name})",
+        sanitizedReason: string.Empty,
         functionType: functionType,
         isReplay: true);
     }
@@ -933,7 +935,6 @@ internal void RaiseEvent(string name, string input)
         FunctionType.Orchestrator,
         this.InstanceId,
         name,
-        this.Config.GetIntputOutputTrace(responseMessage.Result),
         this.IsReplaying);
     }
     else
@@ -943,7 +944,6 @@ internal void RaiseEvent(string name, string input)
         this.Name,
         this.InstanceId,
         name,
-        this.Config.GetIntputOutputTrace(input),
         this.IsReplaying);
@@ -64,18 +64,11 @@ internal OrchestratorExecutionResult GetResult()
         return this.executionResult ?? throw new InvalidOperationException($"The execution result has not yet been set using {nameof(this.SetResult)}.");
     }

-    internal bool TryGetOrchestrationErrorDetails(out string details)
+    internal bool TryGetOrchestrationErrorDetails(out Exception? failure)
     {
-        if (this.failure != null)
-        {
-            details = this.failure.Message;
-            return true;
-        }
-        else
-        {
-            details = string.Empty;
-            return false;
-        }
+        bool hasError = this.failure != null;
+        failure = hasError ? this.failure : null;
+        return hasError;
     }

     internal void SetResult(IEnumerable<OrchestratorAction> actions, string customStatus)
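A hypothetical caller-side sketch of the reshaped TryGetOrchestrationErrorDetails pattern; the class name and stored failure below are illustrative stand-ins, not the extension's actual types. Returning the Exception itself lets the caller decide how much detail to emit.

#nullable enable
using System;

class RemoteContextSketch
{
    // Illustrative stand-in for the context's recorded failure.
    private readonly Exception? failure = new InvalidOperationException("simulated orchestration failure");

    internal bool TryGetOrchestrationErrorDetails(out Exception? failure)
    {
        bool hasError = this.failure != null;
        failure = hasError ? this.failure : null;
        return hasError;
    }

    static void Main()
    {
        var context = new RemoteContextSketch();
        if (context.TryGetOrchestrationErrorDetails(out Exception? error) && error != null)
        {
            // The caller, not the context, now chooses the level of detail:
            // full details for debugging, or type + stack for sanitized telemetry.
            Console.WriteLine(error.ToString());
            Console.WriteLine($"{error.GetType().FullName}\n{error.StackTrace}");
        }
    }
}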
3 changes: 2 additions & 1 deletion src/WebJobs.Extensions.DurableTask/DurableTaskExtension.cs
@@ -1063,13 +1063,14 @@ private async Task EntityMiddleware(DispatchMiddlewareContext dispatchContext, F
         await next();

         // 5. If there were internal or application errors, trace them for DF
-        if (entityContext.ErrorsPresent(out var description))
+        if (entityContext.ErrorsPresent(out string description, out string sanitizedError))
         {
             this.TraceHelper.FunctionFailed(
                 entityContext.HubName,
                 entityContext.Name,
                 entityContext.InstanceId,
                 description,
+                sanitizedReason: sanitizedError,
                 functionType: FunctionType.Entity,
                 isReplay: false);
         }
72 changes: 36 additions & 36 deletions src/WebJobs.Extensions.DurableTask/EndToEndTraceHelper.cs
@@ -4,6 +4,7 @@
 using System;
 using System.Diagnostics;
 using System.Net;
+using DurableTask.Core.Common;
 using Microsoft.Extensions.Logging;

 namespace Microsoft.Azure.WebJobs.Extensions.DurableTask
@@ -140,7 +141,6 @@ public void FunctionStarting(
         functionName,
         taskEventId,
         instanceId,
-        input,
         functionType.ToString(),
         ExtensionVersion,
         isReplay);
@@ -226,7 +226,6 @@ public void FunctionCompleted(
         functionName,
         taskEventId,
         instanceId,
-        output,
         continuedAsNew,
         functionType.ToString(),
         ExtensionVersion,
@@ -239,26 +238,6 @@
         }
     }

-    public void ProcessingOutOfProcPayload(
-        string functionName,
-        string taskHub,
-        string instanceId,
-        string details)
-    {
-        EtwEventSource.Instance.ProcessingOutOfProcPayload(
-            functionName,
-            taskHub,
-            LocalAppName,
-            LocalSlotName,
-            instanceId,
-            details,
-            ExtensionVersion);
-
-        this.logger.LogDebug(
-            "{instanceId}: Function '{functionName} ({functionType})' returned the following OOProc orchestration state: {details}. : {hubName}. AppName: {appName}. SlotName: {slotName}. ExtensionVersion: {extensionVersion}. SequenceNumber: {sequenceNumber}.",
-            instanceId, functionName, FunctionType.Orchestrator, details, taskHub, LocalAppName, LocalSlotName, ExtensionVersion, this.sequenceNumber++);
-    }
-
     public void FunctionTerminated(
         string hubName,
         string functionName,

Review comment from davidmrdavid (author) on the removed ProcessingOutOfProcPayload method: this wasn't used.
@@ -273,7 +252,6 @@ public void FunctionTerminated(
         LocalSlotName,
         functionName,
         instanceId,
-        reason,
         functionType.ToString(),
         ExtensionVersion,
         IsReplay: false);
@@ -298,7 +276,6 @@ public void SuspendingOrchestration(
         LocalSlotName,
         functionName,
         instanceId,
-        reason,
         functionType.ToString(),
         ExtensionVersion,
         IsReplay: false);
@@ -323,7 +300,6 @@ public void ResumingOrchestration(
         LocalSlotName,
         functionName,
         instanceId,
-        reason,
         functionType.ToString(),
         ExtensionVersion,
         IsReplay: false);
@@ -348,7 +324,6 @@ public void FunctionRewound(
         LocalSlotName,
         functionName,
         instanceId,
-        reason,
         functionType.ToString(),
         ExtensionVersion,
         IsReplay: false);
@@ -364,6 +339,7 @@ public void FunctionFailed(
         string functionName,
         string instanceId,
         string reason,
+        string sanitizedReason,
         FunctionType functionType,
         bool isReplay,
         int taskEventId = -1)

Review thread on the new sanitizedReason parameter:

Member: I noticed that sanitizedReason is going to ETW while reason is going to ILogger, but don't we send both of these payloads to Kusto, ETW to DurableFunctionsEvents and ILogger to FunctionsLogs? I know that's not the case for the DTFx tracing code, but I thought the WebJobs ILogger logs went to FunctionsLogs.

davidmrdavid (author): You're right. Throughout this PR, we only sanitize the logs going to DurableFunctionsEvents, not the ILogger-powered logs going to FunctionsLogs. This is because the ILogger logs are also sent to the user's Application Insights instance, and I was worried about changing the logging behavior of a user-facing component. So I'm choosing to delay the decision to sanitize FunctionsLogs for now so that we can unblock sanitizing the DF Kusto table. Does that seem reasonable?

Member: OK, thanks for clarifying. It would be good to confirm the plan for sanitizing FunctionsLogs, just to make sure the work done in this PR isn't redundant and won't need to be reversed. Happy to chat about this offline.

davidmrdavid (author): Let's discuss this offline. But in general I think the cleanup done here is unlikely to need to be reversed. If anything, it's more likely that it will need to be expanded to include FunctionsLogs as well. I'm just opting to merge a minimal improvement for now.
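To make the split described in this thread concrete, here is a minimal stand-alone sketch of the two-sink pattern; the event source name, event id, and exception are illustrative stand-ins, and Console.WriteLine stands in for the ILogger/FunctionsLogs channel that this PR intentionally leaves unchanged.

using System;
using System.Diagnostics.Tracing;

// Hypothetical event source standing in for the extension's EtwEventSource;
// the name and event id are illustrative, not the real ones.
[EventSource(Name = "Demo-DurableTask-Sanitized")]
sealed class DemoEventSource : EventSource
{
    public static readonly DemoEventSource Instance = new DemoEventSource();

    // ETW sink: receives only the sanitized payload (exception type + stack trace).
    [Event(1, Level = EventLevel.Error)]
    public void FunctionFailed(string sanitizedReason) => this.WriteEvent(1, sanitizedReason);
}

class DualSinkSketch
{
    static void Main()
    {
        try
        {
            // The message stands in for content that may contain user data.
            throw new InvalidOperationException("order 42 for user alice@contoso.com failed");
        }
        catch (Exception e)
        {
            string reason = e.ToString();                                       // full details, including the message
            string sanitizedReason = $"{e.GetType().FullName}\n{e.StackTrace}"; // no message

            // Sanitized payload goes to the ETW channel (DurableFunctionsEvents in the discussion above).
            DemoEventSource.Instance.FunctionFailed(sanitizedReason);

            // The ILogger channel (FunctionsLogs / Application Insights) keeps the full reason for now;
            // Console.WriteLine stands in for it here.
            Console.WriteLine(reason);
        }
    }
}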
@@ -377,7 +353,7 @@ public void FunctionFailed(
         functionName,
         taskEventId,
         instanceId,
-        reason,
+        sanitizedReason,
         functionType.ToString(),
         ExtensionVersion,
         isReplay);
@@ -389,6 +365,19 @@
         }
     }

+    public void FunctionFailed(
+        string hubName,
+        string functionName,
+        string instanceId,
+        Exception reason,
+        FunctionType functionType,
+        bool isReplay,
+        int taskEventId = -1)
+    {
+        string sanitizedException = $"{reason.GetType().FullName}\n{reason.StackTrace}";
+        this.FunctionFailed(hubName, functionName, instanceId, reason.ToString(), sanitizedException, functionType, isReplay, taskEventId);
+    }
+
     public void FunctionAborted(
         string hubName,
         string functionName,
@@ -434,8 +423,6 @@ public void OperationCompleted(
         instanceId,
         operationId,
         operationName,
-        input,
-        output,
         duration,
         FunctionType.Entity.ToString(),
         ExtensionVersion,
@@ -456,6 +443,7 @@ public void OperationFailed(
         string operationName,
         string input,
         string exception,
+        string sanitizedException,
         double duration,
         bool isReplay)
     {
@@ -469,7 +457,6 @@
         instanceId,
         operationId,
         operationName,
-        input,
         exception,
         duration,
         FunctionType.Entity.ToString(),
@@ -483,12 +470,26 @@
         }
     }

+    public void OperationFailed(
+        string hubName,
+        string functionName,
+        string instanceId,
+        string operationId,
+        string operationName,
+        string input,
+        Exception exception,
+        double duration,
+        bool isReplay)
+    {
+        string sanitizedException = $"{exception.GetType().FullName}\n{exception.StackTrace}";
+        this.OperationFailed(hubName, functionName, instanceId, operationId, operationName, input, exception.ToString(), sanitizedException, duration, isReplay);
+    }
+
     public void ExternalEventRaised(
         string hubName,
         string functionName,
         string instanceId,
         string eventName,
-        string input,
         bool isReplay)
     {
         if (this.ShouldLogEvent(isReplay))
@@ -502,7 +503,6 @@ public void ExternalEventRaised(
         functionName,
         instanceId,
         eventName,
-        input,
         functionType.ToString(),
         ExtensionVersion,
         isReplay);
@@ -605,7 +605,6 @@ public void EntityResponseReceived(
         FunctionType functionType,
         string instanceId,
         string operationId,
-        string result,
         bool isReplay)
     {
         if (this.ShouldLogEvent(isReplay))
@@ -617,7 +616,6 @@
         functionName,
         instanceId,
         operationId,
-        result,
         functionType.ToString(),
         ExtensionVersion,
         isReplay);
@@ -806,9 +804,11 @@ public void EntityBatchFailed(
         string functionName,
         string instanceId,
         string traceFlags,
-        string details)
+        Exception error)
     {
         FunctionType functionType = FunctionType.Entity;
+        string details = Utils.IsFatal(error) ? error.GetType().Name : error.ToString();
+        string sanitizedDetails = $"{error.GetType().FullName}\n{error.StackTrace}";

         EtwEventSource.Instance.EntityBatchFailed(
             hubName,
@@ -817,7 +817,6 @@ public void EntityBatchFailed(
             functionName,
             instanceId,
             traceFlags,
-            details,
+            sanitizedDetails,
             functionType.ToString(),
             ExtensionVersion);
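Utils.IsFatal comes from the DurableTask.Core.Common namespace added to this file's using directives above. A rough stand-in for the idea follows; the exact set of exception types the real helper treats as fatal is not reproduced here.

using System;

static class FatalExceptionSketch
{
    // Rough stand-in for DurableTask.Core.Common.Utils.IsFatal; the real helper
    // checks a broader, library-defined set of exception types.
    static bool IsFatalLike(Exception e) =>
        e is OutOfMemoryException || e is StackOverflowException;

    static string BuildDetails(Exception e) =>
        // For fatal exceptions, keep only the type name instead of calling ToString();
        // otherwise include the full details for diagnostics.
        IsFatalLike(e) ? e.GetType().Name : e.ToString();

    static void Main()
    {
        Console.WriteLine(BuildDetails(new InvalidOperationException("non-fatal failure")));
        Console.WriteLine(BuildDetails(new OutOfMemoryException()));
    }
}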