Building resilient Azure Functions that gracefully handle failures and recover from errors is essential for production applications. Let’s explore comprehensive error handling patterns, retry strategies, and resilience techniques that will make your serverless applications bulletproof.
Understanding Azure Functions Error Types
Before implementing error handling, it’s crucial to understand the different types of errors you’ll encounter (a rough classification helper follows the list):
- Transient Errors: Network timeouts, temporary service unavailability
- Permanent Errors: Invalid input, authentication failures, business logic violations
- Infrastructure Errors: Function host issues, platform problems
- Dependency Errors: Database connection failures, external API errors
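A quick way to put this taxonomy to work is a small classifier that decides, before any custom exception types come into play, whether a raw exception is worth retrying. This is only a sketch with an assumed mapping; the right rules depend on the libraries your functions call:
using System;
using System.Net.Http;
using System.Net.Sockets;

// Illustrative mapping from common .NET exceptions to the categories above
public static class ErrorClassifier
{
    public static bool IsTransient(Exception ex) => ex switch
    {
        TimeoutException => true,             // slow dependency, usually worth retrying
        SocketException => true,              // network blip
        HttpRequestException => true,         // downstream HTTP failure
        UnauthorizedAccessException => false, // authentication problem, permanent
        ArgumentException => false,           // bad input, permanent
        _ => false                            // unknown: treat as permanent by default
    };
}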
Comprehensive Error Handling Framework
// Custom exception types for different error scenarios
public class TransientException : Exception
{
public TransientException(string message, Exception innerException = null)
: base(message, innerException) { }
}
public class PermanentException : Exception
{
public PermanentException(string message, Exception innerException = null)
: base(message, innerException) { }
}
public class BusinessLogicException : Exception
{
public string ErrorCode { get; }
public object Details { get; }
public BusinessLogicException(string errorCode, string message, object details = null)
: base(message)
{
ErrorCode = errorCode;
Details = details;
}
}
// Error handling wrapper
public static class ErrorHandler
{
[FunctionName("ResilientFunction")]
public static async Task<IActionResult> Run(
[HttpTrigger(AuthorizationLevel.Function, "post")] HttpRequest req,
ILogger log)
{
return await ExecuteWithErrorHandlingAsync(async () =>
{
var request = await DeserializeRequestAsync<ProcessingRequest>(req);
var result = await ProcessDataAsync(request);
return new OkObjectResult(result);
}, log);
}
private static async Task<IActionResult> ExecuteWithErrorHandlingAsync(
Func<Task<IActionResult>> operation,
ILogger log)
{
try
{
return await operation();
}
catch (BusinessLogicException ex)
{
log.LogWarning(ex, "Business logic error: {ErrorCode}", ex.ErrorCode);
return new BadRequestObjectResult(new
{
error = ex.ErrorCode,
message = ex.Message,
details = ex.Details
});
}
catch (PermanentException ex)
{
log.LogError(ex, "Permanent error occurred");
return new BadRequestObjectResult(new
{
error = "PERMANENT_ERROR",
message = ex.Message
});
}
catch (TransientException ex)
{
log.LogWarning(ex, "Transient error occurred - operation may be retried");
return new StatusCodeResult(503); // Service Unavailable - can retry
}
catch (Exception ex)
{
log.LogError(ex, "Unexpected error occurred");
return new StatusCodeResult(500);
}
}
}
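ResilientFunction above references a request type and two helpers that the listing leaves out. One way they might look, placed inside the same ErrorHandler class (ProcessingRequest, its properties, and the validation rule are assumptions for illustration):
using System.IO;
using System.Text.Json;

// Hypothetical request type used by ResilientFunction
public class ProcessingRequest
{
    public string CustomerId { get; set; }
    public decimal Amount { get; set; }
}

// These helpers would sit inside the ErrorHandler class
private static async Task<T> DeserializeRequestAsync<T>(HttpRequest req)
{
    using var reader = new StreamReader(req.Body);
    var body = await reader.ReadToEndAsync();
    if (string.IsNullOrWhiteSpace(body))
        throw new BusinessLogicException("EMPTY_BODY", "Request body is required");
    try
    {
        return JsonSerializer.Deserialize<T>(body,
            new JsonSerializerOptions { PropertyNameCaseInsensitive = true });
    }
    catch (JsonException ex)
    {
        throw new PermanentException("Request body is not valid JSON", ex);
    }
}

private static Task<object> ProcessDataAsync(ProcessingRequest request)
{
    // Application-specific work goes here; wrap dependency failures in TransientException
    return Task.FromResult<object>(new { request.CustomerId, status = "processed" });
}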
Retry Patterns with Exponential Backoff
// Advanced retry mechanism with different strategies
public static class RetryHelper
{
public static async Task<T> ExecuteWithRetryAsync<T>(
Func<Task<T>> operation,
RetryPolicy policy,
ILogger log)
{
var attempt = 0;
var exceptions = new List<Exception>();
while (attempt < policy.MaxAttempts)
{
try
{
log.LogInformation("Executing operation, attempt {Attempt}", attempt + 1);
return await operation();
}
catch (Exception ex) when (ShouldRetry(ex, policy))
{
attempt++;
exceptions.Add(ex);
if (attempt >= policy.MaxAttempts)
{
log.LogError("Operation failed after {MaxAttempts} attempts", policy.MaxAttempts);
throw new AggregateException(exceptions);
}
var delay = CalculateDelay(attempt, policy);
log.LogWarning(ex, "Operation failed on attempt {Attempt}, retrying in {Delay}ms",
attempt, delay.TotalMilliseconds);
await Task.Delay(delay);
}
}
throw new InvalidOperationException("Should not reach this point");
}
private static bool ShouldRetry(Exception ex, RetryPolicy policy)
{
// Don't retry permanent errors
if (ex is PermanentException || ex is BusinessLogicException)
return false;
// Retry transient errors
if (ex is TransientException)
return true;
// Retry specific HTTP status codes (HttpRequestException.StatusCode is populated on .NET 5+)
if (ex is HttpRequestException httpEx && httpEx.StatusCode.HasValue)
{
return policy.RetriableStatusCodes.Contains(httpEx.StatusCode.Value);
}
return false;
}
private static TimeSpan CalculateDelay(int attempt, RetryPolicy policy)
{
return policy.Strategy switch
{
RetryStrategy.Linear => TimeSpan.FromMilliseconds(policy.BaseDelayMs * attempt), // delay grows with each attempt
RetryStrategy.Exponential => TimeSpan.FromMilliseconds(
policy.BaseDelayMs * Math.Pow(2, attempt - 1)),
RetryStrategy.ExponentialWithJitter => TimeSpan.FromMilliseconds(
policy.BaseDelayMs * Math.Pow(2, attempt - 1) * (0.5 + Random.Shared.NextDouble() * 0.5)),
_ => TimeSpan.FromMilliseconds(policy.BaseDelayMs)
};
}
}
public class RetryPolicy
{
public int MaxAttempts { get; set; } = 3;
public int BaseDelayMs { get; set; } = 1000;
public RetryStrategy Strategy { get; set; } = RetryStrategy.ExponentialWithJitter;
public HttpStatusCode[] RetriableStatusCodes { get; set; } =
{
HttpStatusCode.InternalServerError,
HttpStatusCode.BadGateway,
HttpStatusCode.ServiceUnavailable,
HttpStatusCode.GatewayTimeout,
HttpStatusCode.TooManyRequests
};
}
public enum RetryStrategy
{
Linear,
Exponential,
ExponentialWithJitter
}
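Calling the helper is then a thin wrapper around the risky operation. A usage sketch, assuming an HttpClient field named httpClient and an ILogger named log are in scope (the URL is a placeholder):
// Retry a flaky downstream call with exponential backoff and jitter
var policy = new RetryPolicy { MaxAttempts = 4, BaseDelayMs = 500 };

var payload = await RetryHelper.ExecuteWithRetryAsync(async () =>
{
    var response = await httpClient.GetAsync("https://api.example.com/orders");
    response.EnsureSuccessStatusCode(); // throws HttpRequestException for non-success responses
    return await response.Content.ReadAsStringAsync();
}, policy, log);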
Circuit Breaker Pattern
// Circuit breaker implementation for external dependencies
public class CircuitBreaker
{
private readonly int _failureThreshold;
private readonly TimeSpan _timeout;
private readonly ILogger _logger;
private int _failureCount = 0;
private DateTime _lastFailureTime = DateTime.MinValue;
private CircuitBreakerState _state = CircuitBreakerState.Closed;
private readonly object _lock = new object();
public CircuitBreaker(int failureThreshold, TimeSpan timeout, ILogger logger)
{
_failureThreshold = failureThreshold;
_timeout = timeout;
_logger = logger;
}
public async Task<T> ExecuteAsync<T>(Func<Task<T>> operation)
{
lock (_lock)
{
if (_state == CircuitBreakerState.Open)
{
if (DateTime.UtcNow - _lastFailureTime < _timeout)
{
throw new CircuitBreakerOpenException("Circuit breaker is open");
}
// Timeout elapsed - allow a single trial call through
_state = CircuitBreakerState.HalfOpen;
_logger.LogInformation("Circuit breaker transitioning to half-open state");
}
}
try
{
var result = await operation();
if (_state == CircuitBreakerState.HalfOpen)
{
Reset();
}
return result;
}
catch
{
RecordFailure();
throw;
}
}
private void RecordFailure()
{
lock (_lock)
{
_failureCount++;
_lastFailureTime = DateTime.UtcNow;
if (_failureCount >= _failureThreshold)
{
_state = CircuitBreakerState.Open;
_logger.LogWarning("Circuit breaker opened after {FailureCount} failures", _failureCount);
}
}
}
private void Reset()
{
lock (_lock)
{
_failureCount = 0;
_state = CircuitBreakerState.Closed;
_logger.LogInformation("Circuit breaker reset to closed state");
}
}
}
public enum CircuitBreakerState
{
Closed,
Open,
HalfOpen
}
// Exception thrown when a call is rejected because the circuit is open
public class CircuitBreakerOpenException : Exception
{
public CircuitBreakerOpenException(string message, Exception innerException = null)
: base(message, innerException) { }
}
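A breaker only pays off if it is shared across invocations, so keep one instance per dependency at static (per function app instance) level. The sketch below shows one way to wire it into an HTTP-triggered function; the function name, endpoint, and HttpClient field are assumptions:
// One breaker per dependency, shared by all invocations on this instance
private static readonly HttpClient PaymentsClient = new HttpClient();
private static CircuitBreaker _paymentsBreaker;

[FunctionName("CheckPaymentProvider")]
public static async Task<IActionResult> Run(
    [HttpTrigger(AuthorizationLevel.Function, "get")] HttpRequest req,
    ILogger log)
{
    _paymentsBreaker ??= new CircuitBreaker(
        failureThreshold: 5, timeout: TimeSpan.FromSeconds(30), logger: log);

    try
    {
        var status = await _paymentsBreaker.ExecuteAsync(async () =>
        {
            var response = await PaymentsClient.GetAsync("https://payments.example.com/health");
            response.EnsureSuccessStatusCode();
            return await response.Content.ReadAsStringAsync();
        });
        return new OkObjectResult(new { status });
    }
    catch (CircuitBreakerOpenException)
    {
        // Fail fast instead of hammering a dependency that is already struggling
        return new StatusCodeResult(503);
    }
}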
Dead Letter Queue Implementation
// Dead letter queue handling for failed messages
public static class DeadLetterQueueHandler
{
[FunctionName("ProcessMessage")]
public static async Task ProcessMessage(
[ServiceBusTrigger("main-queue", Connection = "ServiceBusConnection")]
string message,
int deliveryCount,
ILogger log)
{
try
{
log.LogInformation("Processing message, delivery count: {DeliveryCount}", deliveryCount);
var messageData = JsonSerializer.Deserialize<MessageData>(message);
await ProcessMessageDataAsync(messageData);
log.LogInformation("Message processed successfully");
}
catch (TransientException ex)
{
log.LogWarning(ex, "Transient error processing message, will retry");
// Let Service Bus handle retry automatically
throw;
}
catch (PermanentException ex)
{
log.LogError(ex, "Permanent error processing message");
// This will go to dead letter queue after max retries
throw;
}
catch (Exception ex)
{
log.LogError(ex, "Unexpected error processing message");
// After max retries, it will go to dead letter queue automatically
throw;
}
}
[FunctionName("ProcessDeadLetterMessages")]
public static async Task ProcessDeadLetterMessages(
[ServiceBusTrigger("main-queue/$deadletterqueue", Connection = "ServiceBusConnection")]
string deadLetterMessage,
ILogger log)
{
log.LogWarning("Processing dead letter message");
// Store in failed messages table for investigation
await StoreFailedMessageAsync(new FailedMessage
{
Id = Guid.NewGuid(),
OriginalMessage = deadLetterMessage,
FailedAt = DateTime.UtcNow
});
// Send alert for critical messages
if (IsCriticalMessage(deadLetterMessage))
{
await SendFailureAlertAsync(deadLetterMessage);
}
}
}
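The handler above leans on helpers the listing omits. ProcessMessageDataAsync, IsCriticalMessage, and SendFailureAlertAsync are application-specific, but the failed-message record and its storage have a fairly typical shape; here is a sketch using Azure Table Storage via the Azure.Data.Tables package (the table name, connection setting, and partitioning scheme are assumptions):
using Azure.Data.Tables;

// Hypothetical record for failed messages awaiting investigation
public class FailedMessage
{
    public Guid Id { get; set; }
    public string OriginalMessage { get; set; }
    public DateTime FailedAt { get; set; }
}

// One possible implementation of StoreFailedMessageAsync
private static async Task StoreFailedMessageAsync(FailedMessage failed)
{
    var tableClient = new TableClient(
        Environment.GetEnvironmentVariable("StorageConnection"), "FailedMessages");
    await tableClient.CreateIfNotExistsAsync();

    var entity = new TableEntity(failed.FailedAt.ToString("yyyy-MM-dd"), failed.Id.ToString())
    {
        ["OriginalMessage"] = failed.OriginalMessage,
        ["FailedAt"] = failed.FailedAt
    };
    await tableClient.AddEntityAsync(entity);
}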
Timeout Management
// Comprehensive timeout handling
public static class TimeoutFunction
{
[FunctionName("TimeoutManagedOperation")]
public static async Task<IActionResult> Run(
[HttpTrigger(AuthorizationLevel.Function, "post")] HttpRequest req,
ILogger log)
{
using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(5)); // Function timeout
try
{
var tasks = new[]
{
ProcessDataWithTimeoutAsync("Operation1", TimeSpan.FromSeconds(30), cts.Token),
ProcessDataWithTimeoutAsync("Operation2", TimeSpan.FromSeconds(45), cts.Token),
ProcessDataWithTimeoutAsync("Operation3", TimeSpan.FromSeconds(60), cts.Token)
};
// Wait for all operations or timeout
var results = await Task.WhenAll(tasks);
return new OkObjectResult(new { results, completedAt = DateTime.UtcNow });
}
catch (OperationCanceledException) when (cts.Token.IsCancellationRequested)
{
log.LogWarning("Operations timed out after 5 minutes");
return new StatusCodeResult(408); // Request Timeout
}
catch (Exception ex)
{
log.LogError(ex, "Operations failed");
return new StatusCodeResult(500);
}
}
private static async Task<string> ProcessDataWithTimeoutAsync(
string operationName,
TimeSpan timeout,
CancellationToken cancellationToken)
{
using var operationCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
operationCts.CancelAfter(timeout);
try
{
// Simulate work that respects cancellation
for (int i = 0; i < 100; i++)
{
operationCts.Token.ThrowIfCancellationRequested();
await Task.Delay(100, operationCts.Token); // Simulate work
}
return $"{operationName} completed successfully";
}
catch (OperationCanceledException) when (operationCts.Token.IsCancellationRequested)
{
throw new TimeoutException($"{operationName} timed out after {timeout.TotalSeconds} seconds");
}
}
}
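The five-minute CancellationTokenSource above mirrors the host-level timeout, but the in-process runtime can also inject a CancellationToken parameter that is signaled when the host is shutting down. Linking it to your own deadline keeps both limits in force; a minimal sketch (the function name and placeholder work are assumptions):
[FunctionName("CancellationAwareOperation")]
public static async Task<IActionResult> Run(
    [HttpTrigger(AuthorizationLevel.Function, "post")] HttpRequest req,
    ILogger log,
    CancellationToken hostToken) // signaled by the runtime on host shutdown
{
    // Combine the host token with our own upper bound
    using var cts = CancellationTokenSource.CreateLinkedTokenSource(hostToken);
    cts.CancelAfter(TimeSpan.FromMinutes(4));

    try
    {
        // Placeholder for real work that honors cooperative cancellation
        await Task.Delay(TimeSpan.FromSeconds(10), cts.Token);
        return new OkObjectResult(new { completedAt = DateTime.UtcNow });
    }
    catch (OperationCanceledException)
    {
        log.LogWarning("Operation cancelled by timeout or host shutdown");
        return new StatusCodeResult(503);
    }
}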
Error Handling Best Practices Checklist
- ✅ Classify errors into transient, permanent, and business logic categories
- ✅ Implement retry logic with exponential backoff and jitter
- ✅ Use circuit breakers for external dependencies
- ✅ Handle dead letter queues for failed message processing
- ✅ Set appropriate timeouts for all operations
- ✅ Log errors with correlation IDs for traceability (see the logging sketch after this list)
- ✅ Monitor error rates and set up alerts
- ✅ Implement graceful degradation when possible
- ✅ Test failure scenarios regularly
- ✅ Use structured logging for better analysis
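For the correlation-ID and structured-logging items, ILogger scopes cover both with little extra plumbing: properties added to a scope are attached to every log entry written inside it and show up as custom dimensions in Application Insights. A minimal sketch, assuming the caller sends an x-correlation-id header and that orderId comes from your own payload:
// Attach a correlation ID to everything logged while the scope is active
// (requires using System.Collections.Generic)
var correlationId = req.Headers.TryGetValue("x-correlation-id", out var values)
    ? values.ToString()
    : Guid.NewGuid().ToString();

using (log.BeginScope(new Dictionary<string, object> { ["CorrelationId"] = correlationId }))
{
    // Named placeholders become structured fields rather than concatenated text
    log.LogInformation("Processing order {OrderId}", orderId);

    // ... rest of the function body ...
}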
Conclusion
Building resilient Azure Functions requires a comprehensive approach to error handling and recovery. By implementing these patterns – retry logic, circuit breakers, dead letter queues, and proper monitoring – you can create serverless applications that gracefully handle failures and maintain high availability.
Remember that resilience is not just about handling errors when they occur, but also about designing your system to fail gracefully and recover quickly. Test your error handling regularly and monitor your applications to identify and address issues before they impact users.