.do

Deployment & Operations

Deploy, monitor, and maintain services in production

Learn how to deploy services to production and operate them reliably at scale.

Pre-Deployment Checklist

Before deploying any service, ensure:

// Gate matrix consulted before any deployment. Every `true` value marks
// the check as required (i.e. blocking); validateDeployment fails the
// deploy when a required check does not pass.
const deploymentChecklist = {
  // Code quality
  code: {
    testsPass: true, // All tests green
    linted: true, // No linting errors
    typeChecked: true, // Type safety verified
    reviewed: true, // Code review complete
  },

  // Service definition
  service: {
    documented: true, // API docs complete
    versioned: true, // Semantic version assigned
    pricing: true, // Pricing model defined
    sla: true, // SLA commitments set
  },

  // Infrastructure
  infrastructure: {
    resourcesProvisioned: true, // CPU/memory allocated
    secretsConfigured: true, // API keys, credentials set
    monitoringSetup: true, // Metrics and alerts configured
    backupsEnabled: true, // Data backup strategy in place
  },

  // Security
  security: {
    authenticationEnabled: true, // Auth required
    inputValidation: true, // Input sanitization
    rateLimiting: true, // Rate limits configured
    encryptionEnabled: true, // Data encryption
  },

  // Business
  business: {
    termsOfService: true, // Legal terms defined
    privacyPolicy: true, // Privacy compliance
    pricingApproved: true, // Pricing strategy approved
    supportPlan: true, // Customer support plan
  },
}

// Validate checklist
/**
 * Runs every check in `deploymentChecklist` against a service.
 *
 * Returns `{ ready, issues }` where `ready` is true only when no required
 * check failed and `issues` lists each failed required check. Only failed
 * required checks are collected, so the accumulator itself IS the issue
 * list — the original recomputed `ready`/`issues` with a redundant
 * `every`/`filter` pass over an array that could only contain failures.
 */
async function validateDeployment(service: Service) {
  const issues: { category: string; check: string; status: string; blocking: boolean }[] = []

  for (const [category, checks] of Object.entries(deploymentChecklist)) {
    for (const [check, required] of Object.entries(checks)) {
      const passed = await validateCheck(service, category, check)

      if (required && !passed) {
        // Every entry in the checklist is required, hence blocking
        issues.push({ category, check, status: 'failed', blocking: true })
      }
    }
  }

  return {
    ready: issues.length === 0, // deploy only when nothing failed
    issues,
  }
}

Deployment Process

1. Staging Deployment

Test in production-like environment:

import $, { db } from 'sdk.do'

// Deploy to staging
/**
 * Deploys a service to the staging environment and gates it through
 * smoke, integration, and load tests before it may proceed to production.
 * Throws if any test stage fails or the load-test error rate exceeds 1%.
 */
async function deployToStaging(service: Service) {
  // Staging runs with deliberately small resource limits.
  const configuration = {
    maxConcurrency: 10,
    timeout: 60000,
    resources: {
      cpu: '1000m',
      memory: '512Mi',
    },
  }

  const staging = await $.Service.deploy({
    serviceId: service.id,
    environment: 'staging',
    configuration,
  })

  // Gate 1: smoke tests
  const smokeTests = await runSmokeTests(staging)
  if (!smokeTests.passed) {
    throw new Error(`Smoke tests failed: ${smokeTests.failures}`)
  }

  // Gate 2: integration tests
  const integrationTests = await runIntegrationTests(staging)
  if (!integrationTests.passed) {
    throw new Error(`Integration tests failed: ${integrationTests.failures}`)
  }

  // Gate 3: sustained load
  const loadTest = await runLoadTest(staging, {
    concurrency: 50,
    duration: 300, // 5 minutes
    targetRPS: 10,
  })
  if (loadTest.errorRate > 0.01) {
    throw new Error(`Load test error rate too high: ${loadTest.errorRate}`)
  }

  return {
    staging,
    tests: { smokeTests, integrationTests, loadTest },
  }
}

2. Production Deployment

Deploy with zero downtime:

// Blue-green deployment
// Zero-downtime rollout: deploy green alongside the serving version,
// health-check it, then shift traffic 10% -> 50% -> 100% before
// undeploying the old (blue) version.
// NOTE(review): `service.version - 1` assumes versions are sequential
// numbers; the pre-deployment checklist mentions semantic versions, so
// confirm the version field's type before relying on this arithmetic.
// NOTE(review): the sleep() calls block the caller for ~30+ minutes total
// — confirm this runs in a workflow context where that is acceptable.
async function deployToProduction(service: Service) {
  // Deploy new version (green) alongside the currently serving one (blue)
  const green = await $.Service.deploy({
    serviceId: service.id,
    environment: 'production',
    version: service.version,
    configuration: {
      maxConcurrency: 100,
      timeout: 60000,
      resources: {
        cpu: '4000m',
        memory: '2Gi',
      },
      replicas: 3,
      autoScaling: {
        minReplicas: 3,
        maxReplicas: 10,
        targetCPU: 70,
      },
    },
  })

  // Wait until green reports healthy (timeout: 5 minutes)
  await waitForHealthy(green, { timeout: 300000 })

  // Canary deployment: 10% traffic to green
  await $.Service.route({
    serviceId: service.id,
    versions: [
      { version: service.version - 1, weight: 90 }, // Old (blue)
      { version: service.version, weight: 10 }, // New (green)
    ],
  })

  // Monitor canary before widening the rollout
  await sleep(600000) // 10 minutes

  const canaryMetrics = await getMetrics(green, { period: '10m' })

  if (canaryMetrics.errorRate > 0.01) {
    // Canary failed: send all traffic back to blue
    await $.Service.route({
      serviceId: service.id,
      versions: [{ version: service.version - 1, weight: 100 }],
    })

    throw new Error('Canary deployment failed, rolled back')
  }

  // Gradual rollout: 50% traffic
  await $.Service.route({
    serviceId: service.id,
    versions: [
      { version: service.version - 1, weight: 50 },
      { version: service.version, weight: 50 },
    ],
  })

  await sleep(600000) // 10 minutes

  // Full rollout: 100% traffic to green
  await $.Service.route({
    serviceId: service.id,
    versions: [{ version: service.version, weight: 100 }],
  })

  // Keep blue around briefly in case a late rollback is needed
  await sleep(3600000) // Wait 1 hour
  await $.Service.undeploy({
    serviceId: service.id,
    version: service.version - 1,
  })

  return green
}

3. Rollback Strategy

Quick rollback if issues occur:

// Automatic rollback on errors
on.Service.errorRate, async (event) => {
  if (event.rate > 0.05) {
    // 5% error rate threshold
    const service = await db.findOne($.Service, { id: event.serviceId })

    // Get previous stable version
    const previousVersion = await db.findOne($.ServiceDeployment, {
      serviceId: service.id,
      status: 'stable',
      version: { lt: service.version },
    })

    if (previousVersion) {
      // Rollback
      await $.Service.rollback({
        serviceId: service.id,
        toVersion: previousVersion.version,
      })

      // Alert team
      send.Alert.create, {
        severity: 'critical',
        title: 'Service rolled back due to high error rate',
        service: service.name,
        errorRate: event.rate,
        rolledBackTo: previousVersion.version,
      })
    }
  }
})

Monitoring & Observability

Metrics Collection

Track key performance indicators:

import { db, on } from 'sdk.do'

// Service metrics
// Registry mapping metric names to their instrument type: 'counter'
// values only accumulate, 'gauge' values are point-in-time readings, and
// 'histogram' tracks a distribution (e.g. response-time percentiles).
const serviceMetrics = {
  // Performance
  responseTime: 'histogram',
  throughput: 'counter',
  errorRate: 'gauge',
  successRate: 'gauge',

  // Resources
  cpuUsage: 'gauge',
  memoryUsage: 'gauge',
  activeRequests: 'gauge',

  // Business
  revenue: 'counter',
  customers: 'gauge',
  conversionRate: 'gauge',
}

// Collect metrics on every request
on.ServiceRequest.created, async (request) => {
  const start = Date.now()

  try {
    // Execute service
    const result = await executeService(request)

    // Record success metrics
    const duration = Date.now() - start

    await recordMetric('service.response_time', duration, {
      serviceId: request.serviceId,
      status: 'success',
    })

    await recordMetric('service.throughput', 1, {
      serviceId: request.serviceId,
    })

    await recordMetric('service.revenue', result.cost, {
      serviceId: request.serviceId,
    })
  } catch (error) {
    // Record error metrics
    await recordMetric('service.errors', 1, {
      serviceId: request.serviceId,
      errorType: error.type,
    })

    await recordMetric('service.error_rate', 1, {
      serviceId: request.serviceId,
    })
  }
})

// Calculate aggregate metrics periodically
// Every minute, derives success/error rates over the last 5 minutes for
// each deployed service and persists a health snapshot.
setInterval(async () => {
  const services = await db.query($.Service, {
    where: { status: 'deployed' },
  })

  for (const service of services) {
    const metrics = await getMetrics(service.id, { period: '5m' })

    // Guard against division by zero when the service saw no traffic:
    // treat an idle window as 0% errors / 100% success. (The original
    // guarded successRate but stored a raw, unguarded division as
    // health.errorRate, producing NaN for idle services.)
    const errorRate = metrics.throughput > 0 ? metrics.errors / metrics.throughput : 0
    const successRate = 1 - errorRate

    await recordMetric('service.success_rate', successRate, {
      serviceId: service.id,
    })

    // Update service health snapshot
    await db.update(service, {
      health: {
        successRate,
        averageResponseTime: metrics.responseTime.p95, // NOTE(review): p95, despite the "average" name
        errorRate,
        lastUpdated: new Date(),
      },
    })
  }
}, 60000) // Every minute

Logging

Structured logging for debugging:

import { logger } from 'sdk.do'

on.ServiceRequest.created, async (request) => {
  // Request logging
  logger.info('service.request.start', {
    requestId: request.id,
    serviceId: request.serviceId,
    customerId: request.customerId,
    inputs: sanitizeInputs(request.inputs),
  })

  try {
    const result = await executeService(request)

    // Success logging
    logger.info('service.request.complete', {
      requestId: request.id,
      duration: result.duration,
      cost: result.cost,
      outputSize: JSON.stringify(result.outputs).length,
    })
  } catch (error) {
    // Error logging
    logger.error('service.request.failed', {
      requestId: request.id,
      error: error.message,
      stack: error.stack,
      inputs: sanitizeInputs(request.inputs),
    })
  }
})

// Performance logging
/**
 * Wraps an async operation with debug/error logging and timing.
 * Logs start, then completion (with duration) on success or failure
 * (with duration and message) on error; errors are rethrown unchanged.
 */
async function executeWithLogging(fn, context) {
  const startedAt = Date.now()
  const elapsed = () => Date.now() - startedAt

  logger.debug('execution.start', context)

  try {
    const result = await fn()
    logger.debug('execution.complete', { ...context, duration: elapsed() })
    return result
  } catch (error) {
    logger.error('execution.failed', { ...context, duration: elapsed(), error: error.message })
    throw error
  }
}

Alerting

Proactive monitoring with alerts:

// Alert rules
const alertRules = [
  {
    name: 'High Error Rate',
    condition: 'service.error_rate > 0.05',
    duration: '5m',
    severity: 'critical',
    action: async (service) => {
      send.Alert.create, {
        title: `High error rate on ${service.name}`,
        description: `Error rate: ${service.health.errorRate}`,
        severity: 'critical',
        service: service.id,
      })

      // Auto-scale to handle load
      await $.Service.scale({
        serviceId: service.id,
        replicas: { add: 2 },
      })
    },
  },
  {
    name: 'Slow Response Time',
    condition: 'service.response_time.p95 > 30000',
    duration: '10m',
    severity: 'warning',
    action: async (service) => {
      send.Alert.create, {
        title: `Slow response time on ${service.name}`,
        description: `P95 response time: ${service.health.averageResponseTime}ms`,
        severity: 'warning',
        service: service.id,
      })
    },
  },
  {
    name: 'High Memory Usage',
    condition: 'service.memory_usage > 0.85',
    duration: '5m',
    severity: 'warning',
    action: async (service) => {
      send.Alert.create, {
        title: `High memory usage on ${service.name}`,
        severity: 'warning',
        service: service.id,
      })

      // Restart service to clear memory
      await $.Service.restart({
        serviceId: service.id,
        graceful: true,
      })
    },
  },
]

// Evaluate alert rules
// Runs every minute; fires a rule's action at most once per rule window
// so a sustained condition does not page repeatedly.
setInterval(async () => {
  const services = await db.query($.Service, {
    where: { status: 'deployed' },
  })

  for (const service of services) {
    for (const rule of alertRules) {
      const triggered = evaluateCondition(rule.condition, service)

      if (triggered) {
        // Rule durations are strings like '5m'; parseInt extracts the
        // leading minute count. (The original multiplied the raw string —
        // '5m' * 60000 is NaN — producing an Invalid Date, so the dedup
        // query never matched and every tick re-alerted.)
        const windowMinutes = parseInt(rule.duration, 10)

        // Skip if an alert for this rule fired within the window
        const recentAlert = await db.findOne($.Alert, {
          serviceId: service.id,
          rule: rule.name,
          createdAt: { gte: new Date(Date.now() - windowMinutes * 60000) },
        })

        if (!recentAlert) {
          await rule.action(service)
        }
      }
    }
  }
}, 60000) // Check every minute

Scaling

Auto-Scaling

Automatically scale based on load:

// Auto-scaling configuration
// Replica count is clamped to [minReplicas, maxReplicas]. Each metric has
// a target plus separate scale-up / scale-down trigger thresholds
// (hysteresis, so the replica count does not flap around the target).
// NOTE(review): the cooldown values are declared here but not referenced
// by the scaling loop below — confirm where they are enforced.
const autoScaling = {
  minReplicas: 2,
  maxReplicas: 20,
  metrics: {
    cpu: {
      target: 70, // 70% CPU utilization
      scaleUp: 80,
      scaleDown: 30,
    },
    memory: {
      target: 75,
      scaleUp: 85,
      scaleDown: 40,
    },
    requestRate: {
      target: 100, // requests per second per replica
      scaleUp: 150,
      scaleDown: 50,
    },
  },
  cooldown: {
    scaleUp: 60, // Wait 60s between scale ups
    scaleDown: 300, // Wait 5min between scale downs
  },
}

// Auto-scaling logic
// Every 30 seconds, nudges each auto-scaled service up or down by one
// replica based on its 5-minute CPU / memory / request-rate averages.
setInterval(async () => {
  const services = await db.query($.Service, {
    where: { autoScaling: { enabled: true } },
  })

  for (const service of services) {
    const metrics = await getMetrics(service.id, { period: '5m' })
    // NOTE(review): assumes service.deployment.replicas reflects the live
    // replica count — confirm it is refreshed after each scale call.
    const currentReplicas = service.deployment.replicas

    // Scale up when ANY metric exceeds its scale-up threshold
    if (
      metrics.cpu > autoScaling.metrics.cpu.scaleUp ||
      metrics.memory > autoScaling.metrics.memory.scaleUp ||
      metrics.requestRate > autoScaling.metrics.requestRate.scaleUp
    ) {
      const newReplicas = Math.min(currentReplicas + 1, autoScaling.maxReplicas)

      if (newReplicas > currentReplicas) {
        await $.Service.scale({
          serviceId: service.id,
          replicas: newReplicas,
        })

        logger.info('service.scaled.up', {
          serviceId: service.id,
          from: currentReplicas,
          to: newReplicas,
          // requestRate is one of the triggers above; the original log
          // omitted it, making request-rate-driven scale-ups undiagnosable.
          reason: { cpu: metrics.cpu, memory: metrics.memory, requestRate: metrics.requestRate },
        })
      }
    }

    // Scale down only when ALL metrics are below their scale-down thresholds
    else if (
      metrics.cpu < autoScaling.metrics.cpu.scaleDown &&
      metrics.memory < autoScaling.metrics.memory.scaleDown &&
      metrics.requestRate < autoScaling.metrics.requestRate.scaleDown
    ) {
      const newReplicas = Math.max(currentReplicas - 1, autoScaling.minReplicas)

      if (newReplicas < currentReplicas) {
        await $.Service.scale({
          serviceId: service.id,
          replicas: newReplicas,
        })

        logger.info('service.scaled.down', {
          serviceId: service.id,
          from: currentReplicas,
          to: newReplicas,
          reason: 'low-utilization',
        })
      }
    }
  }
}, 30000) // Check every 30 seconds

Horizontal Scaling

Distribute load across multiple instances:

// Load balancing
// Probe configuration: an instance is presumably marked unhealthy after
// `unhealthyThreshold` consecutive failed probes of `path`, and healthy
// again after `healthyThreshold` consecutive successes — confirm against
// the balancer implementation.
const loadBalancer = {
  algorithm: 'least-connections', // or 'round-robin', 'ip-hash'
  healthCheck: {
    path: '/health',
    interval: 10, // seconds
    timeout: 5,
    unhealthyThreshold: 3,
    healthyThreshold: 2,
  },
}

// Route requests to healthy instances
/**
 * Dispatches a request to one healthy instance of its service, chosen by
 * the configured load-balancing algorithm. Throws when no healthy
 * instance is available.
 */
async function routeRequest(request: ServiceRequest) {
  const healthy = await getHealthyInstances(request.serviceId)

  if (healthy.length === 0) {
    throw new Error('No healthy instances available')
  }

  // Pick a target per the configured algorithm, then execute there
  const target = selectInstance(healthy, loadBalancer.algorithm, request)
  return await target.execute(request)
}

/**
 * Picks one instance from a list of healthy candidates according to the
 * load-balancing algorithm. Falls back to the first instance for unknown
 * algorithms.
 */
function selectInstance(instances, algorithm, request) {
  switch (algorithm) {
    case 'round-robin':
      // NOTE(review): assumes request.id is numeric — a string id makes
      // the index NaN. Confirm the id type or hash it first.
      return instances[request.id % instances.length]

    case 'least-connections':
      // Instance currently serving the fewest in-flight requests
      return instances.reduce((least, current) => (current.activeConnections < least.activeConnections ? current : least))

    case 'ip-hash': {
      // Same client IP -> same instance. JS `%` keeps the dividend's
      // sign, so normalize: a negative hash would index out of range
      // (the original returned undefined for negative hash codes). The
      // braces also scope the `const`, which previously sat bare in the
      // case clause.
      const hash = hashCode(request.clientIp)
      const index = ((hash % instances.length) + instances.length) % instances.length
      return instances[index]
    }

    default:
      return instances[0]
  }
}

Maintenance

Database Migrations

Handle schema changes safely:

// Migration system
// Ordered list of schema migrations. Each entry pairs an `up` that
// applies the change with a `down` that reverts it; `version` ties the
// migration to the release that introduced it.
const migrations = [
  {
    version: '1.0.0',
    // Initial schema: table recording incoming service requests
    up: async (db) => {
      await db.createTable('service_requests', {
        id: 'uuid',
        serviceId: 'uuid',
        customerId: 'uuid',
        inputs: 'jsonb',
        status: 'varchar',
        createdAt: 'timestamp',
      })
    },
    down: async (db) => {
      await db.dropTable('service_requests')
    },
  },
  {
    version: '1.1.0',
    // Adds request prioritization; default 0 keeps existing rows valid
    up: async (db) => {
      await db.addColumn('service_requests', 'priority', 'integer', {
        default: 0,
      })
    },
    down: async (db) => {
      await db.removeColumn('service_requests', 'priority')
    },
  },
]

// Run migrations
async function migrate(targetVersion?: string) {
  const current = await getCurrentVersion()

  const pending = migrations.filter((m) => compareVersions(m.version, current) > 0 && (!targetVersion || compareVersions(m.version, targetVersion) <= 0))

  for (const migration of pending) {
    logger.info('migration.start', { version: migration.version })

    try {
      await migration.up(db)

      await db.create($.Migration, {
        version: migration.version,
        appliedAt: new Date(),
      })

      logger.info('migration.complete', { version: migration.version })
    } catch (error) {
      logger.error('migration.failed', {
        version: migration.version,
        error: error.message,
      })

      throw error
    }
  }
}

Configuration Updates

Update service configuration without downtime:

// Hot reload configuration
on.Service.configUpdated, async (update) => {
  const service = await db.findOne($.Service, { id: update.serviceId })

  // Apply new configuration
  service.configuration = {
    ...service.configuration,
    ...update.changes,
  }

  // Notify running instances
  send.ServiceInstance.reload, {
    serviceId: service.id,
    configuration: service.configuration,
  })

  await db.update(service, {
    configuration: service.configuration,
    updatedAt: new Date(),
  })

  logger.info('service.config.updated', {
    serviceId: service.id,
    changes: update.changes,
  })
})

Backup & Recovery

Ensure data safety:

// Automated backups
// Daily job: back up every service with backups enabled, record the
// backup's metadata, and prune backups older than 30 days.
setInterval(
  async () => {
    const services = await db.query($.Service, {
      where: { backup: { enabled: true } },
    })

    for (const service of services) {
      try {
        // Backup service data
        const backup = await createBackup(service)

        // Record metadata so the backup can be located for restores
        await db.create($.Backup, {
          serviceId: service.id,
          type: 'automated',
          size: backup.size,
          location: backup.location,
          createdAt: new Date(),
        })

        // Cleanup old backups (retention: 30 days)
        const oldBackups = await db.query($.Backup, {
          where: {
            serviceId: service.id,
            createdAt: { lt: new Date(Date.now() - 30 * 24 * 60 * 60 * 1000) },
          },
        })

        for (const old of oldBackups) {
          await deleteBackup(old) // remove the stored data first…
          await db.delete(old) // …then its metadata record
        }
      } catch (error) {
        // One service's failed backup must not abort the others
        logger.error('backup.failed', {
          serviceId: service.id,
          error: error.message,
        })
      }
    }
  },
  24 * 60 * 60 * 1000
) // Daily backups

// Restore from backup
/**
 * Restores a service's data from a previously recorded backup. The
 * service is stopped for the duration of the restore and restarted on
 * success. Throws when the backup record does not exist or any step fails.
 * NOTE(review): on failure the service is left stopped — confirm whether
 * an automatic restart attempt is wanted in the catch path.
 */
async function restoreFromBackup(serviceId: string, backupId: string) {
  const backup = await db.findOne($.Backup, { id: backupId })

  // Fail fast with a clear error instead of crashing with an opaque
  // TypeError on backup.createdAt below when the record is missing.
  if (!backup) {
    throw new Error(`Backup not found: ${backupId}`)
  }

  logger.info('restore.start', {
    serviceId,
    backupId,
    backupDate: backup.createdAt,
  })

  try {
    // Stop service so no writes occur during the restore
    await $.Service.stop({ serviceId })

    // Restore data
    await restoreBackup(backup)

    // Start service
    await $.Service.start({ serviceId })

    logger.info('restore.complete', { serviceId, backupId })
  } catch (error) {
    logger.error('restore.failed', {
      serviceId,
      backupId,
      error: error.message,
    })

    throw error
  }
}

Best Practices

1. Zero-Downtime Deployments

Always use gradual rollouts

2. Comprehensive Monitoring

Track metrics, logs, and traces

3. Automated Testing

Test before production deployment

4. Disaster Recovery Plan

Document and test recovery procedures

5. Cost Optimization

Monitor and optimize resource usage

Next Steps