Deploying Claude in production requires comprehensive monitoring, cost optimization, security hardening, and scaling strategies. This final part covers production-ready patterns for Azure AI Foundry with Claude, including Application Insights integration, prompt caching optimization, security best practices, and architectural patterns for high-availability systems.
Architecture Patterns
graph TB
Client[Client Application]
APIM[Azure API Management]
AppService[Azure App Service]
KeyVault[Azure Key Vault]
Foundry[Azure AI Foundry]
AppInsights[Application Insights]
Cache[Azure Cache for Redis]
Client -->|HTTPS| APIM
APIM -->|Rate Limit & Auth| AppService
AppService -->|Get Credentials| KeyVault
AppService -->|Check Cache| Cache
AppService -->|API Calls| Foundry
AppService -->|Telemetry| AppInsights
Foundry -->|Claude Models| AppService
style Foundry fill:#0078d4
style AppInsights fill:#68217a
style KeyVault fill:#0078d4
Application Insights Integration
Node.js Implementation
import * as appInsights from 'applicationinsights';
import { DefaultAzureCredential } from '@azure/identity';
import { AnthropicFoundry } from '@anthropic-ai/sdk/foundry';
// Initialize Application Insights.
// Auto-collection covers inbound HTTP requests, performance counters,
// unhandled exceptions, and outbound dependency calls; the connection
// string is supplied via app settings / environment configuration.
appInsights.setup(process.env.APPLICATIONINSIGHTS_CONNECTION_STRING)
.setAutoCollectRequests(true)
.setAutoCollectPerformance(true)
.setAutoCollectExceptions(true)
.setAutoCollectDependencies(true)
.start();
// Telemetry client used below to emit custom metrics and events.
const client = appInsights.defaultClient;
/**
 * Claude client instrumented with Application Insights: emits duration and
 * token-usage metrics on success, and exception + failure-event telemetry
 * on error before rethrowing.
 */
class MonitoredClaudeClient {
  private client: AnthropicFoundry;

  constructor() {
    this.client = new AnthropicFoundry({
      credential: new DefaultAzureCredential(),
      resourceName: process.env.AZURE_FOUNDRY_RESOURCE!
    });
  }

  /**
   * Sends a single user message to Claude and returns the text of the first
   * content block.
   * @param message user prompt
   * @param options optional model / maxTokens overrides
   *                (NOTE(review): ChatOptions is assumed declared elsewhere — confirm)
   * @throws rethrows any API error after telemetry has been recorded
   */
  async chat(message: string, options: ChatOptions = {}): Promise<string> {
    // Resolve once so success and failure telemetry report the same model name.
    const model = options.model || 'claude-sonnet-4-5';
    const startTime = Date.now();
    try {
      const response = await this.client.messages.create({
        model,
        max_tokens: options.maxTokens || 2048,
        messages: [{ role: 'user', content: message }]
      });
      const duration = Date.now() - startTime;
      // Custom metrics: latency plus token usage for cost dashboards.
      client.trackMetric({ name: 'Claude_API_Duration', value: duration });
      client.trackMetric({ name: 'Claude_Input_Tokens', value: response.usage.input_tokens });
      client.trackMetric({ name: 'Claude_Output_Tokens', value: response.usage.output_tokens });
      client.trackEvent({
        name: 'Claude_API_Success',
        properties: {
          model,
          duration: duration.toString()
        }
      });
      // NOTE(review): assumes the first content block is text — confirm for
      // tool-use or mixed-content responses.
      return response.content[0].text;
    } catch (error) {
      // Record the failure, then propagate so callers can react.
      client.trackException({ exception: error as Error });
      client.trackEvent({
        name: 'Claude_API_Failure',
        properties: {
          error: (error as Error).message,
          model
        }
      });
      throw error;
    }
  }
}
Python Implementation
from opencensus.ext.azure.log_exporter import AzureLogHandler
from opencensus.ext.azure.trace_exporter import AzureExporter
from opencensus.trace.samplers import ProbabilitySampler
from opencensus.trace.tracer import Tracer
import logging
import os  # fix: os.environ is read below but `os` was never imported
import time

# Logs are exported to Application Insights via the OpenCensus Azure handler.
logger = logging.getLogger(__name__)
logger.addHandler(AzureLogHandler(
    connection_string=os.environ['APPLICATIONINSIGHTS_CONNECTION_STRING']
))

# Distributed tracing; ProbabilitySampler(1.0) samples 100% of spans
# (lower this in high-volume production to control telemetry cost).
tracer = Tracer(
    exporter=AzureExporter(
        connection_string=os.environ['APPLICATIONINSIGHTS_CONNECTION_STRING']
    ),
    sampler=ProbabilitySampler(1.0)
)
# Claude client whose calls are wrapped in an OpenCensus span and logged to
# Application Insights with model/duration/token custom dimensions.
# NOTE(review): indentation was lost in extraction; restore Python block
# structure before running.
class MonitoredClaudeClient:
def __init__(self):
# NOTE(review): AsyncAnthropicFoundry, DefaultAzureCredential, and os are
# assumed imported earlier in the article — confirm when assembling a module.
self.client = AsyncAnthropicFoundry(
credential=DefaultAzureCredential(),
resource_name=os.environ['AZURE_FOUNDRY_RESOURCE']
)
# Sends one user message and returns the first content block's text.
# Success logs duration plus token usage; failure logs the exception and
# re-raises it (the trailing `raise` follows the logging call).
async def chat(self, message: str, model: str = 'claude-sonnet-4-5') -> str:
with tracer.span(name='Claude_API_Call') as span:
start_time = time.time()
try:
response = await self.client.messages.create(
model=model,
max_tokens=2048,
messages=[{'role': 'user', 'content': message}]
)
duration = time.time() - start_time
# Log metrics as custom dimensions so they are queryable in App Insights.
logger.info('Claude API Success', extra={
'custom_dimensions': {
'model': model,
'duration': duration,
'input_tokens': response.usage.input_tokens,
'output_tokens': response.usage.output_tokens
}
})
# Annotate the trace span for distributed-tracing views.
span.add_attribute('model', model)
span.add_attribute('duration', duration)
return response.content[0].text
except Exception as e:
# logger.exception also records the stack trace.
logger.exception('Claude API Failure', extra={
'custom_dimensions': {
'model': model,
'error': str(e)
}
})
raise
Cost Optimization with Prompt Caching
/** Prompt-caching policy for CostOptimizedClient. */
interface CacheConfig {
// 'ephemeral' = short-lived cache; 'extended' = longer TTL (higher write cost).
type: 'ephemeral' | 'extended';
// Minimum estimated prompt tokens before cache_control is worth attaching.
minTokens: number;
}
/**
 * Claude client that attaches prompt-caching directives to large system
 * prompts and logs the resulting token/cost savings.
 */
class CostOptimizedClient {
  private client: AnthropicFoundry;
  private cacheConfig: CacheConfig;

  constructor(cacheConfig: CacheConfig = { type: 'ephemeral', minTokens: 1024 }) {
    this.client = new AnthropicFoundry({
      credential: new DefaultAzureCredential(),
      resourceName: process.env.AZURE_FOUNDRY_RESOURCE!
    });
    this.cacheConfig = cacheConfig;
  }

  /**
   * Sends userMessage with systemPrompt, enabling prompt caching only when
   * the estimated system-prompt size clears the configured minimum (caching
   * tiny prompts costs more in cache writes than it saves in reads).
   */
  async chatWithContext(
    systemPrompt: string,
    userMessage: string
  ): Promise<string> {
    const systemTokens = this.estimateTokens(systemPrompt);
    const response = await this.client.messages.create({
      model: 'claude-sonnet-4-5',
      max_tokens: 2048,
      system: systemTokens >= this.cacheConfig.minTokens
        ? [{
            type: 'text',
            text: systemPrompt,
            cache_control: { type: this.cacheConfig.type }
          }]
        : systemPrompt,
      messages: [{ role: 'user', content: userMessage }]
    });
    // Surface cache effectiveness so savings show up in logs.
    if (response.usage.cache_read_input_tokens) {
      console.log(`Cache hit! Saved ${response.usage.cache_read_input_tokens} tokens`);
      const savings = this.calculateSavings(response.usage);
      console.log(`Cost savings: $${savings.toFixed(4)}`);
    }
    return response.content[0].text;
  }

  /** Rough heuristic: ~4 characters per token for English text. */
  private estimateTokens(text: string): number {
    return Math.ceil(text.length / 4);
  }

  /** Dollar savings from cache reads vs. paying full input price. */
  private calculateSavings(usage: { cache_read_input_tokens?: number }): number {
    const inputPrice = 3.00 / 1_000_000;     // $/token, Sonnet 4.5 input
    const cacheReadPrice = 0.30 / 1_000_000; // cache reads priced ~90% lower
    const cachedTokens = usage.cache_read_input_tokens ?? 0;
    const fullCost = cachedTokens * inputPrice;
    const cachedCost = cachedTokens * cacheReadPrice;
    return fullCost - cachedCost;
  }
}
Security Best Practices
Key Vault Integration
import { SecretClient } from '@azure/keyvault-secrets';
import { DefaultAzureCredential } from '@azure/identity';
/**
 * Claude client whose secrets come from Azure Key Vault and whose Foundry
 * auth uses Entra ID (DefaultAzureCredential). Call initialize() before chat().
 */
class SecureClaudeClient {
  private client: AnthropicFoundry | null = null;
  private credential: DefaultAzureCredential;
  private keyVaultUrl: string;

  constructor(keyVaultUrl: string) {
    this.credential = new DefaultAzureCredential();
    this.keyVaultUrl = keyVaultUrl;
  }

  /**
   * Retrieves the API key from Key Vault and constructs the Foundry client.
   * @throws if the Key Vault secret cannot be retrieved
   */
  async initialize(): Promise<void> {
    const secretClient = new SecretClient(
      this.keyVaultUrl,
      this.credential
    );
    // NOTE(review): the fetched secret is never used — the client below
    // authenticates via Entra ID instead. Either wire the key into the
    // client or drop this call; as written it only proves Key Vault access.
    const secret = await secretClient.getSecret('foundry-api-key');
    this.client = new AnthropicFoundry({
      credential: this.credential,
      resourceName: process.env.AZURE_FOUNDRY_RESOURCE!
    });
  }

  /** Sends one user message; requires initialize() to have completed. */
  async chat(message: string): Promise<string> {
    if (!this.client) {
      throw new Error('Client not initialized');
    }
    const response = await this.client.messages.create({
      model: 'claude-sonnet-4-5',
      max_tokens: 2048,
      messages: [{ role: 'user', content: message }]
    });
    return response.content[0].text;
  }
}
Rate Limiting
import Bottleneck from 'bottleneck';
/**
 * Claude client throttled client-side with Bottleneck so the application
 * never exceeds the deployment's request quota.
 */
class RateLimitedClient {
  private client: AnthropicFoundry;
  private limiter: Bottleneck;

  constructor() {
    this.client = new AnthropicFoundry({
      credential: new DefaultAzureCredential(),
      resourceName: process.env.AZURE_FOUNDRY_RESOURCE!
    });
    // Token bucket of 800 requests refilled each minute, max 10 in flight
    // (matches the 800 RPM limit for Sonnet 4.5).
    // NOTE(review): this caps requests per minute only — the 80,000 TPM
    // token quota is not enforced here.
    this.limiter = new Bottleneck({
      reservoir: 800,               // Max requests per window
      reservoirRefreshAmount: 800,
      reservoirRefreshInterval: 60 * 1000, // Per minute
      maxConcurrent: 10             // Concurrent requests
    });
  }

  /** Queues the request through the limiter and returns the first text block. */
  async chat(message: string): Promise<string> {
    return this.limiter.schedule(async () => {
      const response = await this.client.messages.create({
        model: 'claude-sonnet-4-5',
        max_tokens: 2048,
        messages: [{ role: 'user', content: message }]
      });
      return response.content[0].text;
    });
  }
}
High Availability Pattern
/**
 * Round-robin failover across multiple Foundry resources with exponential
 * backoff between attempts.
 */
class HighAvailabilityClient {
  private clients: AnthropicFoundry[];
  private currentIndex: number = 0;

  constructor(resources: string[]) {
    this.clients = resources.map(resource =>
      new AnthropicFoundry({
        credential: new DefaultAzureCredential(),
        resourceName: resource
      })
    );
  }

  /**
   * Tries up to maxRetries resources in round-robin order, backing off
   * 1s, 2s, 4s, ... between failed attempts.
   * @throws Error wrapping the last failure when every attempt fails
   */
  async chat(message: string, maxRetries: number = 3): Promise<string> {
    let lastError: Error | null = null;
    for (let attempt = 0; attempt < maxRetries; attempt++) {
      const client = this.getNextClient();
      try {
        const response = await client.messages.create({
          model: 'claude-sonnet-4-5',
          max_tokens: 2048,
          messages: [{ role: 'user', content: message }]
        });
        return response.content[0].text;
      } catch (error) {
        lastError = error as Error;
        console.warn(`Attempt ${attempt + 1} failed, trying next resource`);
        // Fix: back off only when another attempt remains — the original
        // also slept after the final failure, delaying the error for nothing.
        if (attempt < maxRetries - 1) {
          await this.delay(Math.pow(2, attempt) * 1000);
        }
      }
    }
    throw new Error(`All retry attempts failed: ${lastError?.message}`);
  }

  /** Advances the round-robin pointer and returns the next client. */
  private getNextClient(): AnthropicFoundry {
    const client = this.clients[this.currentIndex];
    this.currentIndex = (this.currentIndex + 1) % this.clients.length;
    return client;
  }

  /** Promise-based sleep. */
  private delay(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}
Production Checklist
- Monitoring: Application Insights telemetry, custom metrics, alerting
- Security: Key Vault for secrets, Entra ID authentication, network isolation
- Cost Optimization: Prompt caching (ephemeral/extended), model selection strategy
- Reliability: Retry logic with exponential backoff, circuit breakers, health checks
- Rate Limiting: Client-side throttling, quota monitoring, burst handling
- High Availability: Multi-region deployment, failover strategy, load balancing
- Compliance: Content filtering, audit logging, data residency requirements
Conclusion
This seven-part series provided comprehensive coverage of Azure AI Foundry with Claude integration, from strategic overview through production deployment. You now have production-ready patterns across Node.js, Python, and C#, with DevOps automation and enterprise-grade monitoring, security, and optimization strategies.
