Deployment & Operations
Deploy, monitor, and maintain services in production
Learn how to deploy services to production and operate them reliably at scale.
Pre-Deployment Checklist
Before deploying any service, ensure:
/**
 * Pre-deployment checklist, grouped by area.
 * Each flag states whether the named check is required (`true`) before a
 * service is deployed.
 */
const deploymentChecklist = {
  // --- Code quality ---
  code: {
    // All tests green
    testsPass: true,
    // No linting errors
    linted: true,
    // Type safety verified
    typeChecked: true,
    // Code review complete
    reviewed: true,
  },

  // --- Service definition ---
  service: {
    // API docs complete
    documented: true,
    // Semantic version assigned
    versioned: true,
    // Pricing model defined
    pricing: true,
    // SLA commitments set
    sla: true,
  },

  // --- Infrastructure ---
  infrastructure: {
    // CPU/memory allocated
    resourcesProvisioned: true,
    // API keys, credentials set
    secretsConfigured: true,
    // Metrics and alerts configured
    monitoringSetup: true,
    // Data backup strategy in place
    backupsEnabled: true,
  },

  // --- Security ---
  security: {
    // Auth required
    authenticationEnabled: true,
    // Input sanitization
    inputValidation: true,
    // Rate limits configured
    rateLimiting: true,
    // Data encryption
    encryptionEnabled: true,
  },

  // --- Business ---
  business: {
    // Legal terms defined
    termsOfService: true,
    // Privacy compliance
    privacyPolicy: true,
    // Pricing strategy approved
    pricingApproved: true,
    // Customer support plan
    supportPlan: true,
  },
}
// Validate checklist
/**
 * Runs every entry of `deploymentChecklist` through `validateCheck`, one at a
 * time, and collects the required checks that did not pass.
 *
 * @param service Service handed to `validateCheck` for each check.
 * @returns `ready` is true when no required check failed; `issues` lists the
 *   failed checks with their category.
 */
async function validateDeployment(service: Service) {
  const failures = []

  for (const [category, checks] of Object.entries(deploymentChecklist)) {
    for (const [check, required] of Object.entries(checks)) {
      // Every check is evaluated, including ones that are not required
      const passed = await validateCheck(service, category, check)
      if (passed || !required) {
        continue
      }
      failures.push({ category, check, status: 'failed', blocking: true })
    }
  }

  // Only failed checks are ever collected, so the list doubles as the issue list
  return {
    ready: failures.length === 0,
    issues: [...failures],
  }
}
Deployment Process
1. Staging Deployment
Test in a production-like environment:
import $, { db } from 'sdk.do'
// Deploy to staging
/**
 * Deploys the service to the staging environment and gates it behind three
 * test stages run in order: smoke tests, integration tests, then a load test.
 *
 * @param service Service to deploy; only `id` is read here.
 * @returns The staging deployment together with the results of all three stages.
 * @throws Error when a test stage reports failure or the load-test error rate
 *   is above 1%.
 */
async function deployToStaging(service: Service) {
  // Staging sizing: small footprint, same timeout as production
  const stagingConfiguration = {
    maxConcurrency: 10,
    timeout: 60000,
    resources: { cpu: '1000m', memory: '512Mi' },
  }
  const staging = await $.Service.deploy({
    serviceId: service.id,
    environment: 'staging',
    configuration: stagingConfiguration,
  })

  // Stage 1: smoke tests
  const smokeTests = await runSmokeTests(staging)
  if (smokeTests.passed) {
    // continue to the next stage
  } else {
    throw new Error(`Smoke tests failed: ${smokeTests.failures}`)
  }

  // Stage 2: integration tests
  const integrationTests = await runIntegrationTests(staging)
  if (integrationTests.passed) {
    // continue to the next stage
  } else {
    throw new Error(`Integration tests failed: ${integrationTests.failures}`)
  }

  // Stage 3: load test (duration is in seconds: 300 = 5 minutes)
  const loadProfile = { concurrency: 50, duration: 300, targetRPS: 10 }
  const loadTest = await runLoadTest(staging, loadProfile)
  const errorBudget = 0.01
  if (loadTest.errorRate > errorBudget) {
    throw new Error(`Load test error rate too high: ${loadTest.errorRate}`)
  }

  const tests = { smokeTests, integrationTests, loadTest }
  return { staging, tests }
}
2. Production Deployment
Deploy with zero downtime:
// Canary rollout with staged traffic shifting
/**
 * Deploys `service.version` to production and moves traffic onto it in three
 * stages (10% → 50% → 100%). After each partial stage the function waits
 * 10 minutes and then checks the new deployment's error rate; above 1% all
 * traffic is routed back to the previous version and the function throws.
 * One hour after the full cut-over the previous version is undeployed.
 *
 * @param service Service to release; `id` and `version` are read.
 * @returns The deployment returned by `$.Service.deploy` for the new version.
 * @throws Error when a stage's error-rate check fails (traffic has already
 *   been routed back to the previous version at that point).
 */
async function deployToProduction(service: Service) {
  // NOTE(review): `version - 1` only names the previous release when versions
  // are sequential numbers. The checklist mentions semantic versions — confirm.
  const previousVersion = service.version - 1
  const soakMs = 600000 // 10 minutes
  const maxErrorRate = 0.01

  // Deploy new version (green)
  const green = await $.Service.deploy({
    serviceId: service.id,
    environment: 'production',
    version: service.version,
    configuration: {
      maxConcurrency: 100,
      timeout: 60000,
      resources: {
        cpu: '4000m',
        memory: '2Gi',
      },
      replicas: 3,
      autoScaling: {
        minReplicas: 3,
        maxReplicas: 10,
        targetCPU: 70,
      },
    },
  })

  // Health check
  await waitForHealthy(green, { timeout: 300000 })

  // Routes `newWeight` percent of traffic to the new version, the rest to the old one
  const shiftTraffic = (newWeight: number) => {
    let versions
    if (newWeight >= 100) {
      versions = [{ version: service.version, weight: 100 }]
    } else if (newWeight <= 0) {
      versions = [{ version: previousVersion, weight: 100 }]
    } else {
      versions = [
        { version: previousVersion, weight: 100 - newWeight }, // Old (blue)
        { version: service.version, weight: newWeight }, // New (green)
      ]
    }
    return $.Service.route({ serviceId: service.id, versions })
  }

  // Waits out the soak period, then rolls back and throws if the new version is unhealthy
  const soakAndVerify = async (stage: string) => {
    await sleep(soakMs)
    const metrics = await getMetrics(green, { period: '10m' })
    if (metrics.errorRate > maxErrorRate) {
      await shiftTraffic(0)
      throw new Error(`${stage} failed, rolled back`)
    }
  }

  // Canary deployment: 10% traffic
  await shiftTraffic(10)
  await soakAndVerify('Canary deployment')

  // Gradual rollout: 50% traffic. Verified as well, so a regression that only
  // shows up under higher load is not promoted to 100%.
  await shiftTraffic(50)
  await soakAndVerify('Gradual rollout')

  // Full rollout: 100% traffic
  await shiftTraffic(100)

  // Cleanup old version
  await sleep(3600000) // Wait 1 hour
  await $.Service.undeploy({
    serviceId: service.id,
    version: previousVersion,
  })

  return green
}
3. Rollback Strategy
Quick rollback if issues occur:
// Automatic rollback on errors
/**
 * Automatic rollback: when a service reports an error rate above 5%, roll it
 * back to an earlier deployment marked `stable` and raise a critical alert.
 * Nothing happens when the service or an earlier stable deployment cannot be
 * found.
 */
on($.Service.errorRate, async (event) => {
  // 5% error rate threshold
  if (!(event.rate > 0.05)) {
    return
  }

  const service = await db.findOne($.Service, { id: event.serviceId })
  if (!service) {
    // The event refers to a service that no longer exists; nothing to roll back
    return
  }

  // Get previous stable version
  // NOTE(review): the query does not specify an ordering, so which of several
  // older stable deployments is returned depends on `db.findOne` — confirm.
  const previousVersion = await db.findOne($.ServiceDeployment, {
    serviceId: service.id,
    status: 'stable',
    version: { lt: service.version },
  })
  if (!previousVersion) {
    return
  }

  // Rollback
  await $.Service.rollback({
    serviceId: service.id,
    toVersion: previousVersion.version,
  })

  // Alert team
  send($.Alert.create, {
    severity: 'critical',
    title: 'Service rolled back due to high error rate',
    service: service.name,
    errorRate: event.rate,
    rolledBackTo: previousVersion.version,
  })
})
Monitoring & Observability
Metrics Collection
Track key performance indicators:
import { db, on } from 'sdk.do'
// Service metrics
/**
 * Catalogue of tracked service metrics, mapping each metric name to its
 * metric type ('histogram', 'counter' or 'gauge').
 */
const serviceMetrics = {
  /* Performance */
  responseTime: 'histogram',
  throughput: 'counter',
  errorRate: 'gauge',
  successRate: 'gauge',

  /* Resources */
  cpuUsage: 'gauge',
  memoryUsage: 'gauge',
  activeRequests: 'gauge',

  /* Business */
  revenue: 'counter',
  customers: 'gauge',
  conversionRate: 'gauge',
}
// Collect metrics on every request
/**
 * Collects metrics for every service request: executes the request, then
 * records response time and revenue on success, or error counters on failure.
 * Throughput is recorded for every request, successful or not, so that
 * errors / throughput is a real error ratio.
 * A failure of `executeService` is recorded but not rethrown.
 */
on($.ServiceRequest.created, async (request) => {
  const start = Date.now()
  try {
    // Execute service
    const result = await executeService(request)

    // Record success metrics
    const duration = Date.now() - start
    await recordMetric('service.response_time', duration, {
      serviceId: request.serviceId,
      status: 'success',
    })
    await recordMetric('service.revenue', result.cost, {
      serviceId: request.serviceId,
    })
  } catch (error) {
    // The thrown value is not guaranteed to be an object carrying `type`
    const errorType = typeof error === 'object' && error !== null && 'type' in error ? error.type : undefined

    // Record error metrics
    await recordMetric('service.errors', 1, {
      serviceId: request.serviceId,
      errorType,
    })
    // NOTE(review): this writes a constant 1 to a metric named like a ratio;
    // kept as-is for compatibility — confirm how `service.error_rate` is consumed.
    await recordMetric('service.error_rate', 1, {
      serviceId: request.serviceId,
    })
  } finally {
    // Count the request whether it succeeded or failed
    await recordMetric('service.throughput', 1, {
      serviceId: request.serviceId,
    })
  }
})
// Calculate aggregate metrics periodically
/**
 * Every minute: for each deployed service, derive the success and error rates
 * from the last 5 minutes of metrics, record the success rate, and store a
 * health snapshot on the service.
 */
setInterval(async () => {
  const services = await db.query($.Service, {
    where: { status: 'deployed' },
  })

  for (const service of services) {
    const metrics = await getMetrics(service.id, { period: '5m' })

    // With no traffic in the window there is nothing to divide by: report
    // 0 errors / 100% success instead of NaN or Infinity.
    const errorRate = metrics.throughput > 0 ? metrics.errors / metrics.throughput : 0
    const successRate = 1 - errorRate

    await recordMetric('service.success_rate', successRate, {
      serviceId: service.id,
    })

    // Update service health
    await db.update(service, {
      health: {
        successRate,
        // Holds the p95 response time despite the field name
        averageResponseTime: metrics.responseTime.p95,
        errorRate,
        lastUpdated: new Date(),
      },
    })
  }
}, 60000) // Every minute
Logging
Structured logging for debugging:
import { logger } from 'sdk.do'
/**
 * Structured request logging: logs the start of every service request, then
 * either its completion (duration, cost, serialized output size) or its
 * failure (message, stack, sanitized inputs). Failures are logged, not rethrown.
 */
on($.ServiceRequest.created, async (request) => {
  // Request logging
  logger.info('service.request.start', {
    requestId: request.id,
    serviceId: request.serviceId,
    customerId: request.customerId,
    inputs: sanitizeInputs(request.inputs),
  })

  try {
    const result = await executeService(request)

    // JSON.stringify returns undefined for an undefined value; reading
    // `.length` on that would throw and log a successful request as failed.
    const serializedOutputs = JSON.stringify(result.outputs)

    // Success logging
    logger.info('service.request.complete', {
      requestId: request.id,
      duration: result.duration,
      cost: result.cost,
      outputSize: serializedOutputs === undefined ? 0 : serializedOutputs.length,
    })
  } catch (error) {
    // Anything can be thrown; only Error instances carry message and stack
    const isError = error instanceof Error

    // Error logging
    logger.error('service.request.failed', {
      requestId: request.id,
      error: isError ? error.message : String(error),
      stack: isError ? error.stack : undefined,
      inputs: sanitizeInputs(request.inputs),
    })
  }
})
// Performance logging
/**
 * Runs `fn` and logs its start, completion, or failure together with the
 * elapsed time in milliseconds.
 *
 * @param fn Work to execute; may return a value or a promise.
 * @param context Fields copied into every log entry for this execution.
 * @returns Whatever `fn` resolves to.
 * @throws Rethrows whatever `fn` throws, unchanged, after logging it.
 */
async function executeWithLogging<T>(fn: () => T | Promise<T>, context: object): Promise<T> {
  const start = Date.now()
  logger.debug('execution.start', context)

  try {
    const result = await fn()
    logger.debug('execution.complete', {
      ...context,
      duration: Date.now() - start,
    })
    return result
  } catch (error) {
    logger.error('execution.failed', {
      ...context,
      duration: Date.now() - start,
      // Non-Error throws (strings, plain objects) have no `message`
      error: error instanceof Error ? error.message : String(error),
    })
    throw error
  }
}
Alerting
Proactive monitoring with alerts:
// Alert rules
/**
 * Alert rules. Each rule names a condition, a duration, a severity, and an
 * `action` that is called with the affected service.
 *
 * Every alert carries `serviceId` and `rule` in addition to `service`: the
 * rule evaluator's "already alerted recently" lookup filters `$.Alert` on
 * those two fields, so without them no earlier alert would ever match.
 */
const alertRules = [
  {
    name: 'High Error Rate',
    condition: 'service.error_rate > 0.05',
    duration: '5m',
    severity: 'critical',
    action: async (service) => {
      send($.Alert.create, {
        title: `High error rate on ${service.name}`,
        description: `Error rate: ${service.health.errorRate}`,
        severity: 'critical',
        service: service.id,
        serviceId: service.id,
        rule: 'High Error Rate',
      })

      // Auto-scale to handle load
      await $.Service.scale({
        serviceId: service.id,
        replicas: { add: 2 },
      })
    },
  },
  {
    name: 'Slow Response Time',
    condition: 'service.response_time.p95 > 30000',
    duration: '10m',
    severity: 'warning',
    action: async (service) => {
      send($.Alert.create, {
        title: `Slow response time on ${service.name}`,
        description: `P95 response time: ${service.health.averageResponseTime}ms`,
        severity: 'warning',
        service: service.id,
        serviceId: service.id,
        rule: 'Slow Response Time',
      })
    },
  },
  {
    name: 'High Memory Usage',
    condition: 'service.memory_usage > 0.85',
    duration: '5m',
    severity: 'warning',
    action: async (service) => {
      send($.Alert.create, {
        title: `High memory usage on ${service.name}`,
        severity: 'warning',
        service: service.id,
        serviceId: service.id,
        rule: 'High Memory Usage',
      })

      // Restart service to clear memory
      await $.Service.restart({
        serviceId: service.id,
        graceful: true,
      })
    },
  },
]
// Evaluate alert rules
/**
 * Converts a rule duration into milliseconds.
 * Accepts strings such as '30s', '5m', '2h', '1d' (a missing unit means
 * minutes) and plain numbers, which are taken as minutes.
 *
 * @throws Error when the value cannot be parsed.
 */
function parseDurationMs(duration: string | number): number {
  if (typeof duration === 'number') {
    return duration * 60000
  }
  const match = /^(\d+(?:\.\d+)?)\s*(ms|s|m|h|d)?$/.exec(String(duration).trim())
  if (!match) {
    throw new Error(`Invalid alert rule duration: ${duration}`)
  }
  const unitMs: Record<string, number> = { ms: 1, s: 1000, m: 60000, h: 3600000, d: 86400000 }
  return Number(match[1]) * unitMs[match[2] || 'm']
}

/**
 * Every minute: evaluate each alert rule against each deployed service and
 * run the rule's action, unless an alert for the same service and rule was
 * already created within the rule's duration.
 */
setInterval(async () => {
  const services = await db.query($.Service, {
    where: { status: 'deployed' },
  })

  for (const service of services) {
    for (const rule of alertRules) {
      const triggered = evaluateCondition(rule.condition, service)
      if (!triggered) {
        continue
      }

      // Rule durations are strings like '5m'; multiplying them directly
      // yields NaN and an invalid date, so parse them first.
      const windowStart = new Date(Date.now() - parseDurationMs(rule.duration))

      // Check if already alerted recently
      const recentAlert = await db.findOne($.Alert, {
        serviceId: service.id,
        rule: rule.name,
        createdAt: { gte: windowStart },
      })

      if (!recentAlert) {
        await rule.action(service)
      }
    }
  }
}, 60000) // Check every minute
Scaling
Auto-Scaling
Automatically scale based on load:
// Auto-scaling configuration
/**
 * Auto-scaling configuration: replica bounds, per-metric thresholds, and
 * cooldown periods (in seconds) between scale actions.
 */
const autoScaling = {
  minReplicas: 2,
  maxReplicas: 20,

  metrics: {
    // CPU utilization, percent (target: 70%)
    cpu: { target: 70, scaleUp: 80, scaleDown: 30 },
    memory: { target: 75, scaleUp: 85, scaleDown: 40 },
    // Requests per second per replica
    requestRate: { target: 100, scaleUp: 150, scaleDown: 50 },
  },

  cooldown: {
    // Wait 60s between scale ups
    scaleUp: 60,
    // Wait 5min between scale downs
    scaleDown: 300,
  },
}
// Auto-scaling logic
// Time (ms since epoch) of the last scale-up / scale-down per service id,
// used to enforce `autoScaling.cooldown`
const lastScaleUpAt = new Map<string, number>()
const lastScaleDownAt = new Map<string, number>()

/**
 * Every 30 seconds: for each service with auto-scaling enabled, compare the
 * last 5 minutes of metrics with the `autoScaling` thresholds and add or
 * remove one replica, staying within min/max replicas and honoring the
 * configured cooldown between consecutive scale-ups and scale-downs.
 *
 * Scale-up triggers when ANY metric is above its `scaleUp` threshold;
 * scale-down only when ALL metrics are below their `scaleDown` threshold.
 */
setInterval(async () => {
  const services = await db.query($.Service, {
    where: { autoScaling: { enabled: true } },
  })

  const thresholds = autoScaling.metrics

  for (const service of services) {
    const metrics = await getMetrics(service.id, { period: '5m' })
    const currentReplicas = service.deployment.replicas
    const now = Date.now()

    const overloaded =
      metrics.cpu > thresholds.cpu.scaleUp ||
      metrics.memory > thresholds.memory.scaleUp ||
      metrics.requestRate > thresholds.requestRate.scaleUp

    const underused =
      metrics.cpu < thresholds.cpu.scaleDown &&
      metrics.memory < thresholds.memory.scaleDown &&
      metrics.requestRate < thresholds.requestRate.scaleDown

    // Check if should scale up
    if (overloaded) {
      const newReplicas = Math.min(currentReplicas + 1, autoScaling.maxReplicas)
      const lastUp = lastScaleUpAt.get(service.id)
      const cooledDown = lastUp === undefined || now - lastUp >= autoScaling.cooldown.scaleUp * 1000

      if (newReplicas > currentReplicas && cooledDown) {
        await $.Service.scale({
          serviceId: service.id,
          replicas: newReplicas,
        })
        lastScaleUpAt.set(service.id, now)

        logger.info('service.scaled.up', {
          serviceId: service.id,
          from: currentReplicas,
          to: newReplicas,
          // requestRate can trigger the scale-up too, so report it as well
          reason: { cpu: metrics.cpu, memory: metrics.memory, requestRate: metrics.requestRate },
        })
      }
    }
    // Check if should scale down
    else if (underused) {
      const newReplicas = Math.max(currentReplicas - 1, autoScaling.minReplicas)
      const lastDown = lastScaleDownAt.get(service.id)
      const cooledDown = lastDown === undefined || now - lastDown >= autoScaling.cooldown.scaleDown * 1000

      if (newReplicas < currentReplicas && cooledDown) {
        await $.Service.scale({
          serviceId: service.id,
          replicas: newReplicas,
        })
        lastScaleDownAt.set(service.id, now)

        logger.info('service.scaled.down', {
          serviceId: service.id,
          from: currentReplicas,
          to: newReplicas,
          reason: 'low-utilization',
        })
      }
    }
  }
}, 30000) // Check every 30 seconds
Horizontal Scaling
Distribute load across multiple instances:
// Load balancing
/**
 * Load-balancer settings: the instance-selection algorithm and the health
 * check used to decide which instances are healthy.
 */
const loadBalancer = {
  // Alternatives: 'round-robin', 'ip-hash'
  algorithm: 'least-connections',

  healthCheck: {
    path: '/health',
    // Seconds
    interval: 10,
    timeout: 5,
    unhealthyThreshold: 3,
    healthyThreshold: 2,
  },
}
// Route requests to healthy instances
/**
 * Routes a request to one healthy instance of its service, chosen by the
 * configured load-balancing algorithm, and executes it there.
 *
 * @param request Request to route; `serviceId` selects the instance pool.
 * @returns The result of executing the request on the chosen instance.
 * @throws Error when the service has no healthy instances.
 */
async function routeRequest(request: ServiceRequest) {
  const healthy = await getHealthyInstances(request.serviceId)

  if (healthy.length !== 0) {
    // Pick the target according to the configured algorithm and run the request on it
    const target = selectInstance(healthy, loadBalancer.algorithm, request)
    return await target.execute(request)
  }

  throw new Error('No healthy instances available')
}
// Rotating position used for round-robin when the request id is not an integer
let roundRobinCursor = 0

/**
 * Picks one instance from `instances` according to `algorithm`.
 *
 * - 'round-robin': an integer request id selects `id mod n`; any other id
 *   (for example a UUID string) advances an internal rotating cursor instead.
 * - 'least-connections': the instance with the fewest `activeConnections`
 *   (the first one wins ties).
 * - 'ip-hash': `hashCode(request.clientIp) mod n`, always mapped into range
 *   even when the hash is negative.
 * - anything else: the first instance.
 *
 * @param instances Candidate instances; must not be empty.
 * @param algorithm Name of the selection algorithm.
 * @param request Request being routed; `id` and `clientIp` are read.
 * @returns The selected instance.
 * @throws Error when `instances` is empty.
 */
function selectInstance(instances, algorithm, request) {
  const count = instances.length
  if (count === 0) {
    throw new Error('No healthy instances available')
  }

  // JavaScript's % keeps the sign of the dividend; fold negatives back into [0, count)
  const wrap = (value: number) => ((value % count) + count) % count

  switch (algorithm) {
    case 'round-robin': {
      const numericId = Number(request.id)
      if (Number.isInteger(numericId)) {
        return instances[wrap(numericId)]
      }
      // Non-numeric ids cannot be reduced modulo n; rotate through the pool instead
      const index = roundRobinCursor % count
      roundRobinCursor = (roundRobinCursor + 1) % Number.MAX_SAFE_INTEGER
      return instances[index]
    }
    case 'least-connections':
      return instances.reduce((least, current) => (current.activeConnections < least.activeConnections ? current : least))
    case 'ip-hash': {
      const hash = hashCode(request.clientIp)
      return instances[wrap(hash)]
    }
    default:
      return instances[0]
  }
}
Maintenance
Database Migrations
Handle schema changes safely:
// Migration system
/**
 * Schema migrations. Each entry carries the version it introduces, an `up`
 * step that applies the change and a `down` step that reverts it; both
 * receive the database handle to operate on.
 */
const migrations = [
  // 1.0.0 — create the service_requests table
  {
    version: '1.0.0',
    async up(db) {
      const columns = {
        id: 'uuid',
        serviceId: 'uuid',
        customerId: 'uuid',
        inputs: 'jsonb',
        status: 'varchar',
        createdAt: 'timestamp',
      }
      await db.createTable('service_requests', columns)
    },
    async down(db) {
      await db.dropTable('service_requests')
    },
  },
  // 1.1.0 — add an integer priority column defaulting to 0
  {
    version: '1.1.0',
    async up(db) {
      const options = { default: 0 }
      await db.addColumn('service_requests', 'priority', 'integer', options)
    },
    async down(db) {
      await db.removeColumn('service_requests', 'priority')
    },
  },
]
// Run migrations
/**
 * Applies every migration newer than the current version, up to and including
 * `targetVersion` when one is given, in ascending version order. A
 * `$.Migration` record is written after each successful step.
 *
 * @param targetVersion Highest version to migrate to; omit to apply all
 *   pending migrations.
 * @throws Rethrows the first failure after logging it; later migrations are
 *   not attempted.
 */
async function migrate(targetVersion?: string) {
  const current = await getCurrentVersion()

  const pending = migrations
    .filter((m) => compareVersions(m.version, current) > 0 && (!targetVersion || compareVersions(m.version, targetVersion) <= 0))
    // Apply in version order regardless of the order the list was declared in
    // (filter returns a fresh array, so sorting does not touch `migrations`)
    .sort((a, b) => compareVersions(a.version, b.version))

  for (const migration of pending) {
    logger.info('migration.start', { version: migration.version })

    try {
      await migration.up(db)
      await db.create($.Migration, {
        version: migration.version,
        appliedAt: new Date(),
      })
      logger.info('migration.complete', { version: migration.version })
    } catch (error) {
      logger.error('migration.failed', {
        version: migration.version,
        // Non-Error throws have no `message`
        error: error instanceof Error ? error.message : String(error),
      })
      throw error
    }
  }
}
Configuration Updates
Update service configuration without downtime:
// Hot reload configuration
/**
 * Hot-reloads service configuration: merges the update's changes over the
 * stored configuration, persists the result, and then tells running
 * instances to reload it.
 *
 * The configuration is saved before instances are notified, so a failed
 * write cannot leave instances running settings that were never stored.
 */
on($.Service.configUpdated, async (update) => {
  const service = await db.findOne($.Service, { id: update.serviceId })
  if (!service) {
    logger.error('service.config.update.failed', {
      serviceId: update.serviceId,
      error: 'Service not found',
    })
    return
  }

  // Apply new configuration (keys in `changes` override existing ones)
  const configuration = {
    ...service.configuration,
    ...update.changes,
  }

  await db.update(service, {
    configuration,
    updatedAt: new Date(),
  })

  // Notify running instances
  send($.ServiceInstance.reload, {
    serviceId: service.id,
    configuration,
  })

  logger.info('service.config.updated', {
    serviceId: service.id,
    changes: update.changes,
  })
})
Backup & Recovery
Ensure data safety:
// Automated backups
/**
 * Daily backup job: for every service with backups enabled, create a backup,
 * record it as a `$.Backup`, then remove that service's backups older than
 * 30 days. A failure for one service is logged and does not stop the others.
 */
setInterval(
  async () => {
    const retentionMs = 30 * 24 * 60 * 60 * 1000

    const targets = await db.query($.Service, {
      where: { backup: { enabled: true } },
    })

    for (const service of targets) {
      try {
        // Take the backup and register it
        const snapshot = await createBackup(service)
        await db.create($.Backup, {
          serviceId: service.id,
          type: 'automated',
          size: snapshot.size,
          location: snapshot.location,
          createdAt: new Date(),
        })

        // Find and purge backups past the retention window
        const cutoff = new Date(Date.now() - retentionMs)
        const expired = await db.query($.Backup, {
          where: {
            serviceId: service.id,
            createdAt: { lt: cutoff },
          },
        })
        for (const stale of expired) {
          await deleteBackup(stale)
          await db.delete(stale)
        }
      } catch (error) {
        logger.error('backup.failed', {
          serviceId: service.id,
          error: error.message,
        })
      }
    }
  },
  24 * 60 * 60 * 1000
) // Daily backups
// Restore from backup
/**
 * Restores a service from one of its backups: stops the service, restores the
 * backup data, then starts the service again.
 *
 * @param serviceId Service to restore.
 * @param backupId Backup to restore from; must belong to `serviceId`.
 * @throws Error when the backup does not exist or belongs to another service
 *   (nothing has been stopped at that point).
 * @throws Rethrows any failure from stopping, restoring or starting, after
 *   logging it. The service is NOT restarted automatically after a failed
 *   restore.
 */
async function restoreFromBackup(serviceId: string, backupId: string) {
  const backup = await db.findOne($.Backup, { id: backupId })
  if (!backup) {
    throw new Error(`Backup not found: ${backupId}`)
  }
  // Backups are recorded with the id of the service they were taken from;
  // refuse to load another service's data into this one.
  if (String(backup.serviceId) !== String(serviceId)) {
    throw new Error(`Backup ${backupId} does not belong to service ${serviceId}`)
  }

  logger.info('restore.start', {
    serviceId,
    backupId,
    backupDate: backup.createdAt,
  })

  try {
    // Stop service
    await $.Service.stop({ serviceId })

    // Restore data
    await restoreBackup(backup)

    // Start service
    await $.Service.start({ serviceId })

    logger.info('restore.complete', { serviceId, backupId })
  } catch (error) {
    logger.error('restore.failed', {
      serviceId,
      backupId,
      // Non-Error throws have no `message`
      error: error instanceof Error ? error.message : String(error),
    })
    throw error
  }
}
Best Practices
1. Zero-Downtime Deployments
Always use gradual rollouts
2. Comprehensive Monitoring
Track metrics, logs, and traces
3. Automated Testing
Test before production deployment
4. Disaster Recovery Plan
Document and test recovery procedures
5. Cost Optimization
Monitor and optimize resource usage
Next Steps
- Explore Examples → See deployed services
- Learn Best Practices → Optimize operations