terraform-google-ona-runner/runner-vm.tf at main · gitpod-io/terraform-google-ona-runner · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
# Runner VM Instance Group Module
# Deploy runner service in a container on Compute Engine VM instances

locals {
  auth_proxy_url = "https://4430s--${var.runner_id}.${var.runner_domain}/initial-spec"

  proxy_enabled = var.proxy_config != null
  ca_enabled    = var.ca_certificate != null

  http_proxy  = local.proxy_enabled ? var.proxy_config.http_proxy : ""
  https_proxy = local.proxy_enabled ? var.proxy_config.https_proxy : ""
  all_proxy   = local.proxy_enabled ? var.proxy_config.all_proxy : ""
  # we add some default values to the no_proxy variable along with the customer provided values
  no_proxy = local.proxy_enabled ? "${var.proxy_config.no_proxy},localhost,127.0.0.1,googleapis.com,metadata.google.internal,${var.runner_domain}" : ""

  # Trust bundle certificate GCS bucket and object info
  ca_bucket_name = local.has_certificates ? google_storage_bucket.runner_assets.name : ""
  ca_object_name = local.has_certificates ? google_storage_bucket_object.trust_bundle[0].name : ""

  # Agent storage bucket (only created when agents are enabled)
  agent_bucket_name = var.enable_agents ? google_storage_bucket.agent_storage[0].name : ""
}

# ================================
# TLS CERTIFICATE FOR AUTH PROXY
# ================================


# Time-based rotation trigger - rotates certificates every 30 days
resource "time_rotating" "auth_proxy_cert_rotation" {
  rotation_days = 30
}

# Create a self-signed certificate for auth proxy internal use
resource "tls_private_key" "auth_proxy" {
  algorithm = "RSA"
  rsa_bits  = 2048

  # Force recreation when rotation time changes
  lifecycle {
    create_before_destroy = true
  }
}

resource "tls_self_signed_cert" "auth_proxy" {
  private_key_pem = tls_private_key.auth_proxy.private_key_pem

  subject {
    common_name  = "${var.runner_name}-auth-proxy.internal"
    organization = "Gitpod"
  }

  validity_period_hours = 8760 # 1 year

  # Force recreation when rotation time changes
  lifecycle {
    create_before_destroy = true
  }

  allowed_uses = [
    "key_encipherment",
    "digital_signature",
    "server_auth",
  ]

  dns_names = [
    "${var.runner_name}-auth-proxy.internal",
    "auth-proxy.internal",
    "localhost"
  ]

  ip_addresses = [
    "127.0.0.1"
  ]
}


# Artifact Registry for container images
resource "google_artifact_registry_repository" "runner" {
  location      = var.region
  repository_id = "gitpod-cache-${var.runner_id}"
  description   = "Container images for Ona runner"
  format        = "DOCKER"
  project       = var.project_id
  kms_key_name  = local.kms_key_name

  labels = local.runner_labels

  docker_config {
    immutable_tags = true
  }

  cleanup_policies {
    id     = "expire-old-images"
    action = "DELETE"
    condition {
      tag_state  = "ANY"
      older_than = "2592000s" # 30 days
    }
  }
}
# Cloud-init configuration for runner VMs
data "cloudinit_config" "runner" {
  gzip          = false
  base64_encode = false

  part {
    content_type = "text/cloud-config"
    content = templatefile("${path.module}/files/runner-cloud-init.tftpl", {
      RUNNER_ID                            = var.runner_id
      PROJECT_ID                           = var.project_id
      REGION                               = var.region
      ZONES                                = join(",", var.zones)
      VPC_NAME                             = var.vpc_name
      VPC_PROJECT_ID                       = local.vpc_project_id
      SUBNET_NAME                          = var.runner_subnet_name
      RUNNER_TOKEN_SECRET                  = google_secret_manager_secret.runner_token.secret_id
      REDIS_CREDENTIALS_SECRET             = google_secret_manager_secret.redis_auth.secret_id
      SERVICE_ACCOUNT_EMAIL                = local.runner_sa_email
      ENVIRONMENT_VM_SERVICE_ACCOUNT_EMAIL = local.environment_vm_sa_email
      ARTIFACT_REGISTRY_HOST               = "${var.region}-docker.pkg.dev"
      API_ENDPOINT                         = var.api_endpoint
      BUILD_CACHE_BUCKET                   = google_storage_bucket.build_cache.name
      PROXY_DOMAIN                         = var.runner_domain
      SSH_PORT                             = var.ssh_port
      INSTANCE_GROUP_NAME                  = "${var.runner_name}-group"
      RUNNER_IMAGE_URL                     = var.development_version != "" ? local.runner_dev_image : local.runner_image
      DEVELOPMENT_VERSION                  = var.development_version
      PUBSUB_SUBSCRIPTION_ID               = google_pubsub_subscription.compute_events.name
      AUTH_PROXY_URL                       = local.auth_proxy_url
      AUTH_PROXY_TLS_CERT                  = tls_self_signed_cert.auth_proxy.cert_pem
      AUTH_PROXY_TLS_KEY                   = tls_private_key.auth_proxy.private_key_pem
      RUNNER_LOGS_URL                      = local.logs_url
      PROMETHEUS_IMAGE                     = local.prometheus_image
      NODE_EXPORTER_IMAGE                  = local.node_exporter_image
      LOADBALANCER_TYPE                    = var.loadbalancer_type
      CERTIFICATE_ID                       = var.certificate_id
      CERTIFICATE_SECRET_ID                = var.certificate_secret_id
      METRICS_SECRET_ID                    = "${var.runner_id}-metrics"
      ENABLE_AGENTS                        = var.enable_agents
      AGENT_BUCKET_NAME                    = local.agent_bucket_name
      RUNNER_ASSETS_BUCKET_NAME            = google_storage_bucket.runner_assets.name
      HONEYCOMB_API_KEY                    = var.honeycomb_api_key
      MIG_WARM_POOL_ENABLED                = var.mig_warm_pool_enabled
      # Proxy configuration
      HTTP_PROXY  = local.http_proxy
      HTTPS_PROXY = local.https_proxy
      NO_PROXY    = local.no_proxy
      ALL_PROXY   = local.all_proxy
      # CA certificate configuration
      CA_ENABLED       = local.ca_enabled
      HAS_TRUST_BUNDLE = local.has_certificates
      CA_BUCKET_NAME   = local.ca_bucket_name
      CA_OBJECT_NAME   = local.ca_object_name
      # Docker config configuration
      DOCKER_CONFIG_ENABLED     = local.docker_config_enabled
      DOCKER_CONFIG_BUCKET_NAME = local.docker_config_bucket_name
      DOCKER_CONFIG_OBJECT_NAME = local.docker_config_object_name
      # Insecure registries configuration
      INSECURE_REGISTRIES_ENABLED = local.insecure_registries_enabled
      INSECURE_REGISTRIES_JSON    = local.insecure_registries_json
      # CMEK configuration
      KMS_KEY_NAME = local.kms_key_name
      # Custom image registry configuration
      RUNNER_USES_CUSTOM_IMAGE = local.runner_uses_custom_image
      CUSTOM_RUNNER_REGISTRY   = local.custom_runner_registry
      # Environment VM labels configuration
      ENVIRONMENT_VM_LABELS = join(",", [for k, v in var.labels : "${k}=${v}"])
    })
  }
}

# Create instance template for runner VMs
resource "google_compute_instance_template" "runner" {
  name_prefix  = "${var.runner_name}-runner-"
  project      = var.project_id
  machine_type = var.runner_vm_config.machine_type
  region       = var.region

  tags = ["gitpod-runner", "gitpod-type-runner", "allow-health-check", "lb-health-check", "gitpod-runner-${var.runner_id}"]

  labels = local.runner_labels

  disk {
    source_image = "cos-cloud/cos-stable"
    auto_delete  = true
    boot         = true
    disk_size_gb = 20
    disk_type    = "hyperdisk-balanced"

    # Optional CMEK encryption for boot disk
    dynamic "disk_encryption_key" {
      for_each = local.kms_key_name != null ? [1] : []
      content {
        kms_key_self_link = local.kms_key_name
      }
    }
  }

  shielded_instance_config {
    enable_secure_boot = true
  }


  network_interface {
    network            = "projects/${local.vpc_project_id}/global/networks/${var.vpc_name}"
    subnetwork         = var.runner_subnet_name
    subnetwork_project = local.vpc_project_id
    nic_type           = "GVNIC"
  }

  service_account {
    email = local.runner_sa_email
    scopes = [
      "https://www.googleapis.com/auth/logging.write",
      "https://www.googleapis.com/auth/monitoring.write",
      "https://www.googleapis.com/auth/compute",
      "https://www.googleapis.com/auth/devstorage.read_write",
      "https://www.googleapis.com/auth/pubsub",
      "https://www.googleapis.com/auth/cloud-platform"
    ]
  }

  # Container-Optimized OS metadata for running the runner container
  metadata = {
    google-logging-enabled       = "true"
    google-monitoring-enabled    = "true"
    google-logging-use-fluentbit = "true"
    serial-port-logging-enable   = "true"

    # Cloud-init configuration for Prometheus setup
    user-data = data.cloudinit_config.runner.rendered

    "cos-metrics-enabled" = "true"
  }


  lifecycle {
    create_before_destroy = true
  }
}

# Create managed instance group
resource "google_compute_region_instance_group_manager" "runner" {
  # enables features like min_ready_sec
  provider = google-beta

  name                      = "${var.runner_name}-group"
  region                    = var.region
  project                   = var.project_id
  base_instance_name        = var.runner_name
  distribution_policy_zones = var.zones

  instance_lifecycle_policy {
    default_action_on_failure = "REPAIR"
    force_update_on_repair    = "NO"
  }

  version {
    instance_template = google_compute_instance_template.runner.id
  }

  named_port {
    name = "http"
    port = 8080
  }

  named_port {
    name = "health"
    port = 9091
  }

  auto_healing_policies {
    health_check      = google_compute_health_check.runner.id
    initial_delay_sec = 120 # Reduced from 180 for faster deletion while allowing startup
  }

  update_policy {
    # Use rolling update for zero-downtime deployments
    type                         = "PROACTIVE"
    instance_redistribution_type = "PROACTIVE"

    # Configurable actions for different update scenarios
    minimal_action                 = var.runner_vm_config.update_policy_config.minimal_action
    most_disruptive_allowed_action = "REPLACE"

    # Rolling update configuration optimized for self-updating runner
    # Surge-first strategy: create new instances before destroying old ones
    # This ensures the updater instance survives until the very end
    max_surge_fixed       = max(length(var.zones), 2)
    max_unavailable_fixed = var.runner_vm_config.update_policy_config.max_unavailable == 0 ? 0 : max(length(var.zones), var.runner_vm_config.update_policy_config.max_unavailable)

    # Use SUBSTITUTE method to create new instances before destroying old ones
    replacement_method = "SUBSTITUTE"

    # Slower, safer updates to ensure stability - aligned with health check timing
    min_ready_sec = 120 # 2 minutes to allow for container startup and initial health checks
  }

  wait_for_instances = true

  # Ensure Redis cache is ready before creating runner instances
  depends_on = [google_redis_cluster.cache]

  lifecycle {
    create_before_destroy = true
  }
}

# Create autoscaler for the instance group
resource "google_compute_region_autoscaler" "runner" {
  name    = "${var.runner_name}-autoscaler"
  region  = var.region
  target  = google_compute_region_instance_group_manager.runner.id
  project = var.project_id

  autoscaling_policy {
    min_replicas    = 1 # Always maintain at least 1 runner instance
    max_replicas    = 2 # Allow up to 2 instances for rollouts, scale down to min after
    cooldown_period = 60

    cpu_utilization {
      target = 0.7
    }
  }
}

# Health check for runner service
resource "google_compute_health_check" "runner" {
  name    = "${var.runner_name}-health"
  project = var.project_id

  timeout_sec         = 10 # Increased timeout for slow container responses and network issues
  check_interval_sec  = 20 # Less frequent checks to reduce load during startup
  healthy_threshold   = 2  # Still require 2 consecutive successes
  unhealthy_threshold = 6  # More tolerance for temporary failures during startup

  # Enable detailed logging for debugging health check failures
  log_config {
    enable = true
  }

  http_health_check {
    port         = 9091
    request_path = "/_health"
  }

  lifecycle {
    create_before_destroy = true
  }
}


# Resource tagging for lifecycle management
resource "google_compute_project_metadata" "runner_metadata" {
  project = var.project_id

  metadata = {
    "enable-oslogin"   = "TRUE"
    "gitpod-runner-id" = var.runner_id
  }
}