Welcome back to our deep dive into Azure Event Grid resilience patterns. In Part 1, we explored the foundational concepts and strategic approaches to dead letter handling. Now, let’s get our hands dirty with practical implementation patterns, automation strategies, and real-world configuration examples.
Infrastructure as Code: Configuring Retry Policies
Let’s start with concrete examples of how to configure retry policies and dead letter destinations using ARM templates and Terraform.
ARM Template Configuration
{
"type": "Microsoft.EventGrid/eventSubscriptions",
"apiVersion": "2022-06-15",
"name": "[parameters('subscriptionName')]",
"properties": {
"destination": {
"endpointType": "WebHook",
"properties": {
"endpointUrl": "[parameters('webhookUrl')]",
"maxEventsPerBatch": 1,
"preferredBatchSizeInKilobytes": 64
}
},
"retryPolicy": {
"maxDeliveryAttempts": 30,
"eventTimeToLiveInMinutes": 1440
},
"deadLetterDestination": {
"endpointType": "StorageBlob",
"properties": {
"resourceId": "[parameters('storageAccountId')]",
"blobContainerName": "deadletter-events"
}
},
"eventDeliverySchema": "EventGridSchema",
"filter": {
"includedEventTypes": [
"Microsoft.Storage.BlobCreated",
"CustomApp.OrderProcessed"
]
}
}
}
Terraform Configuration
resource "azurerm_eventgrid_event_subscription" "critical_events" {
name = "critical-events-subscription"
scope = azurerm_storage_account.source.id
webhook_endpoint {
url = var.webhook_url
max_events_per_batch = 1
preferred_batch_size_in_kilobytes = 64
}
retry_policy {
# Event Grid caps delivery attempts at 30 and event TTL at 1440 minutes (24 hours)
max_delivery_attempts = 30
event_time_to_live = 1440 # 24 hours (service maximum) for critical events
}
storage_blob_dead_letter_destination {
storage_account_id = azurerm_storage_account.deadletter.id
storage_blob_container_name = "critical-deadletter"
}
included_event_types = [
"Microsoft.Storage.BlobCreated",
"CustomApp.PaymentProcessed"
]
advanced_filter {
string_contains {
key = "data.api"
values = ["payment", "billing"]
}
}
}
# Separate subscription for non-critical events
resource "azurerm_eventgrid_event_subscription" "standard_events" {
name = "standard-events-subscription"
scope = azurerm_storage_account.source.id
webhook_endpoint {
url = var.standard_webhook_url
}
retry_policy {
max_delivery_attempts = 10
event_time_to_live = 360 # 6 hours
}
# Event Grid dead-lettering supports only Blob Storage destinations
storage_blob_dead_letter_destination {
storage_account_id = azurerm_storage_account.deadletter.id
storage_blob_container_name = "standard-deadletter"
}
included_event_types = [
"CustomApp.UserActivity",
"CustomApp.NotificationSent"
]
}
Dead Letter Processing Automation
Once events land in dead letter destinations, you need robust automation to process them. Here are several proven patterns:
Pattern 1: Azure Function Dead Letter Processor
graph TD
    A[Dead Letter Storage] --> B[Blob Trigger Function]
    B --> C{Event Analysis}
    C -->|Transient Error| D[Republish to Event Grid]
    C -->|Malformed Event| E[Transform & Republish]
    C -->|Permanent Failure| F[Alert & Log]
    C -->|Unknown| G[Manual Review Queue]
    D --> H[Success Metrics]
    E --> H
    F --> I[Error Dashboard]
    G --> J[Operations Team]

    style D fill:#90EE90
    style E fill:#87CEEB
    style F fill:#FFB6C1
    style G fill:#DDA0DD
[FunctionName("ProcessDeadLetterEvents")]
public static async Task Run(
[BlobTrigger("deadletter-events/{name}", Connection = "DeadLetterStorage")] Stream blobStream,
string name,
[EventGrid(TopicEndpointUri = "EventGridEndpoint", TopicKeySetting = "EventGridKey")] IAsyncCollector<EventGridEvent> eventCollector,
ILogger log)
{
using var reader = new StreamReader(blobStream);
var eventData = await reader.ReadToEndAsync();
// Dead letter blobs can contain a JSON array of events; this sketch assumes one event per blob
var deadLetterEvent = JsonConvert.DeserializeObject<EventGridEvent>(eventData);
// Analyze failure reason
var failureReason = await AnalyzeFailureReason(deadLetterEvent);
switch (failureReason.Category)
{
case FailureCategory.Transient:
await RetryEvent(deadLetterEvent, eventCollector, log);
break;
case FailureCategory.MalformedData:
var transformedEvent = await TransformEvent(deadLetterEvent);
await eventCollector.AddAsync(transformedEvent);
break;
case FailureCategory.Permanent:
await LogPermanentFailure(deadLetterEvent, failureReason);
break;
case FailureCategory.Unknown:
await QueueForManualReview(deadLetterEvent);
break;
}
}
private static async Task<FailureAnalysisResult> AnalyzeFailureReason(EventGridEvent deadLetterEvent)
{
// Extract the delivery metadata recorded on the dead-lettered event
// (parse the payload as JSON so the properties can be read by name)
var metadata = JObject.Parse(deadLetterEvent.Data.ToString());
var deliveryCount = metadata.Value<int?>("deliveryCount") ?? 0;
var lastDeliveryResult = metadata.Value<string>("lastDeliveryResult");
var lastHttpStatusCode = metadata.Value<int?>("lastHttpStatusCode") ?? 0;
// Categorize based on status codes and patterns
if (lastHttpStatusCode >= 500 && lastHttpStatusCode < 600)
return new FailureAnalysisResult { Category = FailureCategory.Transient };
if (lastHttpStatusCode == 400 || lastHttpStatusCode == 422)
return new FailureAnalysisResult { Category = FailureCategory.MalformedData };
if (lastHttpStatusCode == 404 || lastHttpStatusCode == 401)
return new FailureAnalysisResult { Category = FailureCategory.Permanent };
return new FailureAnalysisResult { Category = FailureCategory.Unknown };
}
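The RetryEvent helper referenced in the switch above isn't shown in the original listing. A minimal sketch, assuming the Azure.Messaging.EventGrid event model and a hypothetical republishCount property stamped onto the payload to cap recovery loops, might look like this:

// Minimal sketch of the RetryEvent helper called above (assumed, not from the original post).
// Requires Azure.Messaging.EventGrid, Microsoft.Azure.WebJobs, Microsoft.Extensions.Logging,
// and Newtonsoft.Json.Linq. "republishCount" is a hypothetical property used to cap recovery loops.
private const int MaxRepublishAttempts = 3;

private static async Task RetryEvent(
    EventGridEvent deadLetterEvent,
    IAsyncCollector<EventGridEvent> eventCollector,
    ILogger log)
{
    // Read any prior republish count from the payload (defaults to 0 when absent)
    var payload = JObject.Parse(deadLetterEvent.Data.ToString());
    var republishCount = payload.Value<int?>("republishCount") ?? 0;

    if (republishCount >= MaxRepublishAttempts)
    {
        log.LogWarning("Event {EventId} exceeded the republish limit; leaving it in dead letter storage",
            deadLetterEvent.Id);
        return;
    }

    payload["republishCount"] = republishCount + 1;

    // Re-create the event with the updated payload and publish it back to the topic
    var retryEvent = new EventGridEvent(
        deadLetterEvent.Subject,
        deadLetterEvent.EventType,
        deadLetterEvent.DataVersion,
        BinaryData.FromString(payload.ToString()));

    await eventCollector.AddAsync(retryEvent);
    log.LogInformation("Republished dead-lettered event {EventId} (attempt {Attempt})",
        deadLetterEvent.Id, republishCount + 1);
}

Capping republish attempts matters because a republished event that keeps failing will land back in the same dead letter container and trigger this function again.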
Pattern 2: Service Bus Dead Letter Processing
For high-volume scenarios, Service Bus provides more sophisticated dead letter handling:
[FunctionName("ProcessServiceBusDeadLetter")]
public static async Task Run(
[ServiceBusTrigger("deadletter", Connection = "ServiceBusConnection", IsSessionsEnabled = false)]
ServiceBusReceivedMessage message,
ServiceBusMessageActions messageActions,
ILogger log)
{
try
{
// Extract original event from dead letter message
var originalEvent = JsonConvert.DeserializeObject<EventGridEvent>(message.Body.ToString());
// Implement exponential backoff for retry
var retryCount = message.ApplicationProperties.ContainsKey("RetryCount")
? (int)message.ApplicationProperties["RetryCount"] : 0;
if (retryCount < MAX_MANUAL_RETRIES)
{
await Task.Delay(TimeSpan.FromSeconds(Math.Pow(2, retryCount)));
// Clone the message with an updated retry count and replay it to the processing queue
var retryMessage = new ServiceBusMessage(message.Body)
{
ApplicationProperties = { ["RetryCount"] = retryCount + 1 }
};
await retryCollector.AddAsync(retryMessage);
// Complete the original so it is removed from the dead letter queue
await messageActions.CompleteMessageAsync(message);
}
else
{
// Send to permanent failure storage
await StorePermanentFailure(originalEvent);
await messageActions.CompleteMessageAsync(message);
}
}
catch (Exception ex)
{
log.LogError(ex, "Failed to process dead letter message");
await messageActions.AbandonMessageAsync(message);
}
}
Advanced Configuration Patterns
Event Type-Specific Retry Policies
Different event types often require different resilience strategies. Here’s how to implement event-specific policies:
graph TB subgraph "Event Classification" A[Incoming Event] --> B{Event Type Router} B -->|Payment Events| C[Critical Queue50 retries, 48h TTL] B -->|User Actions| D[Standard Queue10 retries, 6h TTL] B -->|Analytics| E[Best Effort Queue3 retries, 1h TTL] B -->|System Events| F[Infrastructure Queue20 retries, 12h TTL] end subgraph "Dead Letter Destinations" C --> G[Premium StorageImmediate Alerts] D --> H[Standard Service BusHourly Processing] E --> I[Blob StorageDaily Batch Review] F --> J[Operations QueueReal-time Monitoring] end style C fill:#FF6B6B style D fill:#4ECDC4 style E fill:#45B7D1 style F fill:#96CEB4
Dynamic Retry Configuration
Implement dynamic retry policies that adapt based on downstream service health:
public class DynamicRetryPolicyManager
{
private readonly IServiceHealthMonitor _healthMonitor;
private readonly IEventGridManagementClient _eventGridClient;
private readonly ILogger<DynamicRetryPolicyManager> _logger;
public async Task AdjustRetryPolicies()
{
var services = await _healthMonitor.GetServiceHealthStatus();
foreach (var service in services)
{
var subscription = await _eventGridClient.EventSubscriptions
.GetAsync(service.SubscriptionResourceGroup, service.SubscriptionName);
var newPolicy = CalculateOptimalRetryPolicy(service);
if (ShouldUpdatePolicy(subscription.RetryPolicy, newPolicy))
{
subscription.RetryPolicy = newPolicy;
await _eventGridClient.EventSubscriptions.UpdateAsync(
service.SubscriptionResourceGroup,
service.SubscriptionName,
subscription);
_logger.LogInformation($"Updated retry policy for {service.Name}: " +
$"MaxAttempts={newPolicy.MaxDeliveryAttempts}, TTL={newPolicy.EventTimeToLiveInMinutes}");
}
}
}
private EventSubscriptionRetryPolicy CalculateOptimalRetryPolicy(ServiceHealthStatus service)
{
// Reduce retries for unhealthy services
var baseAttempts = service.HealthScore > 0.8 ? 30 : 10;
var adjustedAttempts = (int)(baseAttempts * service.HealthScore);
return new EventSubscriptionRetryPolicy
{
MaxDeliveryAttempts = Math.Max(1, adjustedAttempts),
EventTimeToLiveInMinutes = service.IsHealthy ? 1440 : 360 // 24h vs 6h
};
}
}
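To run this adjustment loop on a schedule, a timer-triggered function works well. The sketch below assumes DynamicRetryPolicyManager is registered with the Functions dependency-injection container; the 15-minute CRON schedule is an arbitrary choice:

// Sketch: run the policy adjustment every 15 minutes (an arbitrary schedule).
// Assumes DynamicRetryPolicyManager is registered with the Functions DI container.
public class RetryPolicyAdjustmentFunction
{
    private readonly DynamicRetryPolicyManager _policyManager;

    public RetryPolicyAdjustmentFunction(DynamicRetryPolicyManager policyManager)
    {
        _policyManager = policyManager;
    }

    [FunctionName("AdjustRetryPolicies")]
    public async Task Run([TimerTrigger("0 */15 * * * *")] TimerInfo timer, ILogger log)
    {
        log.LogInformation("Starting retry policy adjustment run");
        await _policyManager.AdjustRetryPolicies();
    }
}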
Event Enrichment and Transformation Patterns
Sometimes dead-lettered events can be salvaged through transformation. Here’s a comprehensive approach:
Schema Evolution Handler
public class EventSchemaTransformer
{
private readonly Dictionary<string, IEventTransformer> _transformers;
public EventSchemaTransformer()
{
_transformers = new Dictionary<string, IEventTransformer>
{
["CustomApp.OrderCreated.v1"] = new OrderEventV1ToV2Transformer(),
["CustomApp.UserRegistered.v1"] = new UserEventV1ToV2Transformer(),
["Microsoft.Storage.BlobCreated"] = new BlobEventEnricher()
};
}
public async Task<EventGridEvent> TransformEvent(EventGridEvent deadLetterEvent)
{
var originalEvent = ExtractOriginalEvent(deadLetterEvent);
if (_transformers.TryGetValue(originalEvent.EventType, out var transformer))
{
var transformedEvent = await transformer.TransformAsync(originalEvent);
// Add transformation metadata
transformedEvent.Data = AddTransformationMetadata(transformedEvent.Data, new
{
TransformedAt = DateTime.UtcNow,
OriginalVersion = originalEvent.EventType,
TransformationReason = "dead_letter_recovery",
TransformerId = transformer.GetType().Name
});
return transformedEvent;
}
throw new UnsupportedEventTypeException($"No transformer available for {originalEvent.EventType}");
}
}
public class OrderEventV1ToV2Transformer : IEventTransformer
{
public async Task<EventGridEvent> TransformAsync(EventGridEvent originalEvent)
{
var v1Data = originalEvent.Data.ToObjectFromJson<OrderEventV1>();
var v2Data = new OrderEventV2
{
OrderId = v1Data.OrderId,
CustomerId = v1Data.CustomerId,
// New required fields with defaults
Currency = v1Data.Currency ?? "USD",
TaxAmount = v1Data.TaxAmount ?? 0,
ShippingAddress = v1Data.ShippingAddress ?? new Address { Country = "US" },
// Handle breaking changes
OrderItems = v1Data.Items?.Select(item => new OrderItemV2
{
ProductId = item.ProductId,
Quantity = item.Quantity,
UnitPrice = item.Price, // Field renamed
TaxRate = 0.0m // New field with default
}).ToList() ?? new List<OrderItemV2>()
};
// EventGridEvent (Azure.Messaging.EventGrid) is constructed via (subject, eventType, dataVersion, data)
return new EventGridEvent(
originalEvent.Subject,
"CustomApp.OrderCreated.v2",
"2.0",
BinaryData.FromObjectAsJson(v2Data))
{
Id = Guid.NewGuid().ToString(),
EventTime = DateTimeOffset.UtcNow
};
}
}
Cost Optimization Strategies
Dead letter handling can become expensive at scale. Here are proven cost optimization techniques:
Intelligent Batching
graph TD
    A[Dead Letter Events] --> B[Batch Aggregator]
    B --> C{Batch Size Check}
    C -->|< 100 events| D[Wait for more events]
    C -->|≥ 100 events| E[Process Batch]
    C -->|Timeout reached| E
    D --> F[Timer: 5 minutes]
    F --> C
    E --> G[Bulk Analysis]
    G --> H[Pattern Detection]
    H --> I[Batch Transformation]
    I --> J[Bulk Republish]

    subgraph "Cost Savings"
        K[Function Executions: -80%]
        L[Processing Time: -60%]
        M[Storage Operations: -70%]
    end

    style E fill:#90EE90
    style K fill:#87CEEB
    style L fill:#87CEEB
    style M fill:#87CEEB
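A minimal sketch of the aggregation step is shown below, assuming the flush delegate stands in for the bulk analysis, transformation, and republish stages in the diagram; the 100-event and five-minute thresholds mirror the diagram and are not tuned values:

// Minimal batch aggregator sketch: flush when the batch reaches 100 events or 5 minutes pass,
// whichever comes first. EventGridEvent comes from Azure.Messaging.EventGrid.
public class DeadLetterBatchAggregator
{
    private const int MaxBatchSize = 100;
    private static readonly TimeSpan MaxBatchAge = TimeSpan.FromMinutes(5);

    private readonly List<EventGridEvent> _batch = new();
    private readonly Func<IReadOnlyList<EventGridEvent>, Task> _flushHandler;
    private DateTime _batchStartedUtc = DateTime.UtcNow;

    public DeadLetterBatchAggregator(Func<IReadOnlyList<EventGridEvent>, Task> flushHandler)
    {
        _flushHandler = flushHandler;
    }

    public async Task AddAsync(EventGridEvent deadLetterEvent)
    {
        _batch.Add(deadLetterEvent);

        // Flush on size or age, whichever threshold is crossed first
        if (_batch.Count >= MaxBatchSize || DateTime.UtcNow - _batchStartedUtc >= MaxBatchAge)
        {
            await FlushAsync();
        }
    }

    public async Task FlushAsync()
    {
        if (_batch.Count == 0) return;

        await _flushHandler(_batch.ToArray());
        _batch.Clear();
        _batchStartedUtc = DateTime.UtcNow;
    }
}

Note the sketch is single-threaded by design; a production version would need synchronization, or the whole step could be replaced by a Durable Functions aggregation pattern.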
Tiered Storage Strategy
# PowerShell script for automated blob lifecycle management (requires the Az.Storage module)
$resourceGroupName = "rg-eventgrid-prod"
$storageAccountName = "deadletterstorageacct"
# All rules target block blobs under the dead letter container prefix
$filter = New-AzStorageAccountManagementPolicyFilter -PrefixMatch "deadletter-events/" -BlobType blockBlob
$rules = @()
# Rule 1: Move recent dead letters to the cool tier after 30 days
$coolAction = Add-AzStorageAccountManagementPolicyAction -BaseBlobAction TierToCool -DaysAfterModificationGreaterThan 30
$rules += New-AzStorageAccountManagementPolicyRule -Name "DeadLetterCoolTier" -Action $coolAction -Filter $filter
# Rule 2: Move old dead letters to the archive tier after 90 days
$archiveAction = Add-AzStorageAccountManagementPolicyAction -BaseBlobAction TierToArchive -DaysAfterModificationGreaterThan 90
$rules += New-AzStorageAccountManagementPolicyRule -Name "DeadLetterArchiveTier" -Action $archiveAction -Filter $filter
# Rule 3: Delete very old dead letters after 2 years
$deleteAction = Add-AzStorageAccountManagementPolicyAction -BaseBlobAction Delete -DaysAfterModificationGreaterThan 730
$rules += New-AzStorageAccountManagementPolicyRule -Name "DeadLetterDelete" -Action $deleteAction -Filter $filter
Set-AzStorageAccountManagementPolicy `
-ResourceGroupName $resourceGroupName `
-StorageAccountName $storageAccountName `
-Rule $rules
Write-Host "Dead letter storage lifecycle policy configured successfully"
Testing Dead Letter Scenarios
Proper testing is crucial for validating your dead letter handling. Here’s a comprehensive testing framework:
Chaos Engineering for Event Grid
public class EventGridChaosExperiment
{
private readonly IEventGridPublisherClient _eventGridClient;
private readonly IChaosConfiguration _config;
[Test]
public async Task SimulateDownstreamServiceFailure()
{
// Arrange: Deploy test webhook that fails with specific patterns
var testWebhook = await DeployFaultyWebhook(new WebhookFailureConfig
{
FailureRate = 0.7, // 70% failure rate
FailureTypes = new[] { HttpStatusCode.ServiceUnavailable, HttpStatusCode.RequestTimeout },
FailureDuration = TimeSpan.FromMinutes(15)
});
// Act: Send test events
var testEvents = GenerateTestEvents(100);
await _eventGridClient.SendEventsAsync(testEvents);
// Wait for retry period to complete
await Task.Delay(TimeSpan.FromMinutes(20));
// Assert: Verify dead letter handling
var deadLetterEvents = await GetDeadLetterEvents();
var reprocessedEvents = await GetReprocessedEvents();
Assert.That(deadLetterEvents.Count, Is.EqualTo(70).Within(5)); // ~70% should be dead lettered
Assert.That(reprocessedEvents.Count, Is.GreaterThan(60)); // Most should be automatically reprocessed
// Verify SLA metrics
var avgTimeToDeadLetter = CalculateAverageTimeToDeadLetter(deadLetterEvents);
Assert.That(avgTimeToDeadLetter, Is.LessThan(TimeSpan.FromMinutes(30)));
}
[Test]
public async Task ValidateRetryExponentialBackoff()
{
var deliveryAttempts = new List<DateTime>();
// Setup webhook that records all delivery attempts
var recordingWebhook = await DeployRecordingWebhook(deliveryAttempts);
// Send single test event to consistently failing endpoint
await _eventGridClient.SendEventAsync(CreateTestEvent());
// Wait long enough to observe several retry attempts (Event Grid's full retry schedule
// stretches over up to 24 hours, so only the early attempts land within this window)
await Task.Delay(TimeSpan.FromHours(2));
// Verify the exponential backoff pattern across the attempts observed so far
Assert.That(deliveryAttempts.Count, Is.LessThanOrEqualTo(30)); // Never exceeds max delivery attempts
for (int i = 1; i < deliveryAttempts.Count; i++)
{
var timeBetweenAttempts = deliveryAttempts[i] - deliveryAttempts[i - 1];
var expectedMinInterval = TimeSpan.FromSeconds(10 * Math.Pow(2, Math.Min(i - 1, 6)));
Assert.That(timeBetweenAttempts, Is.GreaterThanOrEqualTo(expectedMinInterval));
}
}
}
Production Monitoring Dashboard
Create comprehensive monitoring for your dead letter systems:
graph TB subgraph "Real-time Metrics" A[Event Grid Delivery Rate] --> D[Azure Monitor] B[Dead Letter Queue Depth] --> D C[Retry Success Rate] --> D end subgraph "Alerting Rules" D --> E{Threshold Checks} E -->|Dead Letter > 100| F[PagerDuty Alert] E -->|Success Rate < 95%| G[Teams Alert] E -->|Retry Spike| H[Email Alert] end subgraph "Dashboards" I[Executive DashboardSLA Metrics] J[Operations DashboardReal-time Status] K[Engineering DashboardTechnical Metrics] end D --> I D --> J D --> K style F fill:#FF6B6B style G fill:#FFA500 style H fill:#FFD700
Coming Up in Part 3
In the final part of this series, we’ll cover:
- Advanced multi-region dead letter strategies
- Event Grid custom topics for complex routing scenarios
- Integration with Azure Service Bus for enterprise messaging patterns
- Performance optimization for high-throughput scenarios
- Compliance and audit considerations for dead letter handling
- Real-world case studies and lessons learned
We’ll also provide a complete reference architecture with all the code samples organized into a production-ready solution.
Found this implementation guide helpful? Share your own dead letter handling patterns in the comments, or let me know what specific scenarios you’d like covered in Part 3!