This is an automated email from the ASF dual-hosted git repository.
damccorm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git
The following commit(s) were added to refs/heads/master by this push:
new 9524b56c148 Add observability (#37716)
9524b56c148 is described below
commit 9524b56c148f3dfe01ba12fff2b9d02ab576a160
Author: Tarun Annapareddy <[email protected]>
AuthorDate: Thu Feb 26 13:59:00 2026 -0800
Add observability (#37716)
* Add Observability Metrics
* Update script
* update readme
* fix readme
* update readme
* update port
* Update examples/terraform/envoy-ratelimiter/deploy.sh
Co-authored-by: gemini-code-assist[bot]
<176961590+gemini-code-assist[bot]@users.noreply.github.com>
* fix gemini review
---------
Co-authored-by: gemini-code-assist[bot]
<176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
examples/terraform/envoy-ratelimiter/README.md | 58 +++++++++--
examples/terraform/envoy-ratelimiter/deploy.sh | 66 ++++++++++++
.../terraform/envoy-ratelimiter/prerequisites.tf | 1 +
examples/terraform/envoy-ratelimiter/ratelimit.tf | 114 +++++++++++----------
examples/terraform/envoy-ratelimiter/variables.tf | 4 +-
5 files changed, 177 insertions(+), 66 deletions(-)
diff --git a/examples/terraform/envoy-ratelimiter/README.md
b/examples/terraform/envoy-ratelimiter/README.md
index bb974873591..b1275fbf200 100644
--- a/examples/terraform/envoy-ratelimiter/README.md
+++ b/examples/terraform/envoy-ratelimiter/README.md
@@ -38,7 +38,7 @@ Example Beam Java Pipelines using it:
- **Cloud NAT (Prerequisite)**: Allows private nodes to pull Docker images.
- **Envoy Rate Limit Service**: A stateless Go/gRPC service that handles rate
limit logic.
- **Redis**: Stores the rate limit counters.
-- **StatsD Exporter**: Sidecar container that converts StatsD metrics to
Prometheus format, exposed on port `9102`.
+- **Prometheus Metrics**: Exposes Prometheus metrics on port `9090`. These
metrics are exported to Google Cloud Monitoring.
- **Internal Load Balancer**: A Google Cloud TCP Load Balancer exposing the
Rate Limit service internally within the VPC.
## Prerequisites:
@@ -82,7 +82,7 @@ cluster_name = "ratelimit-cluster" # Name of
the GKE cluster
deletion_protection = true # Prevent accidental
cluster deletion (set "true" for prod)
control_plane_cidr = "172.16.0.0/28" # CIDR for GKE control
plane (must not overlap with subnet)
namespace = "envoy-ratelimiter" # Kubernetes namespace for
deployment
-enable_metrics = false # Deploy statsd-exporter
sidecar
+enable_metrics = true # Enable metrics export to
Google Cloud Monitoring
ratelimit_replicas = 1 # Initial number of Rate
Limit pods
min_replicas = 1 # Minimum HPA replicas
max_replicas = 5 # Maximum HPA replicas
@@ -110,25 +110,34 @@ EOF
```
# Deploy Envoy Rate Limiter:
-1. Initialize Terraform to download providers and modules:
+
+1. **Deploy Script (Recommended)**:
+Run the helper script to handle the deployment process automatically:
```bash
-terraform init
+./deploy.sh
```
+The script will provide the IP address of the load balancer once the
deployment is complete.
-2. Plan and apply the changes:
+2. **Deploy (Manual Alternative)**:
+If you prefer running Terraform manually, you can use the following commands:
```bash
-terraform plan -out=tfplan
-terraform apply tfplan
+# Step 1: Initialize Terraform
+terraform init
+
+# Step 2: Create Cluster
+terraform apply -target=time_sleep.wait_for_cluster
+
+# Step 3: Create Resources
+terraform apply
```
-3. Connect to the service:
After deployment, get the **Internal** IP address:
```bash
terraform output load_balancer_ip
```
The service is accessible **only from within the VPC** (e.g., via Dataflow
workers or GCE instances in the same network) at `<INTERNAL_IP>:8081`.
-4. **Test with Dataflow Workflow**:
+3. **Test with Dataflow Workflow**:
Verify connectivity and rate limiting logic by running the example Dataflow
pipeline.
```bash
@@ -150,11 +159,40 @@ The service is accessible **only from within the VPC**
(e.g., via Dataflow worke
```
+# Observability & Metrics:
+This module supports exporting native Prometheus metrics to **Google Cloud
Monitoring**.
+
+ `enable_metrics` is set to `true` by default.
+
+### Sample Metrics
+| Metric Name | Description |
+| :--- | :--- |
+| `ratelimit_service_rate_limit_total_hits` | Total rate limit requests
received. |
+| `ratelimit_service_rate_limit_over_limit` | Requests that exceeded the limit
(HTTP 429). |
+| `ratelimit_service_rate_limit_near_limit` | Requests that are approaching
the limit. |
+| `ratelimit_service_call_should_rate_limit` | Total valid gRPC calls to the
service. |
+
+*Note: You will also see many other Go runtime metrics (`go_*`) and Redis
client metrics (`redis_*`).*
+
+### Viewing in Google Cloud Console
+1. Go to **Monitoring** > **Metrics Explorer**.
+2. Click **Select a metric**.
+3. Search for `ratelimit` and select **Prometheus Target** > **ratelimit**.
+4. Select a metric (e.g., `ratelimit_service_rate_limit_over_limit`) and click
**Apply**.
+5. Use **Filters** to drill down by `domain`, `key`, and `value` (e.g.,
`key=database`, `value=users`).
+
# Clean up resources:
To destroy the cluster and all created resources:
+
+```bash
+./deploy.sh destroy
+```
+
+Alternatively:
```bash
terraform destroy
```
+
*Note: If `deletion_protection` was enabled, you must set it to `false` in
`terraform.tfvars` before destroying.*
# Variables description:
@@ -169,7 +207,7 @@ terraform destroy
|control_plane_cidr |CIDR block for GKE control plane
|172.16.0.0/28 |
|cluster_name |Name of the GKE cluster
|ratelimit-cluster |
|namespace |Kubernetes namespace to deploy resources into
|envoy-ratelimiter |
-|enable_metrics |Deploy statsd-exporter sidecar
|false |
+|enable_metrics |Enable metrics export to Google Cloud Monitoring
|true |
|deletion_protection |Prevent accidental cluster deletion
|false |
|ratelimit_replicas |Initial number of Rate Limit pods
|1 |
|min_replicas |Minimum HPA replicas
|1 |
diff --git a/examples/terraform/envoy-ratelimiter/deploy.sh
b/examples/terraform/envoy-ratelimiter/deploy.sh
new file mode 100755
index 00000000000..2ac0e081f7e
--- /dev/null
+++ b/examples/terraform/envoy-ratelimiter/deploy.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This script deploys the Envoy Rate Limiter on GKE.
+
+set -e
+
+COMMAND=${1:-"apply"}
+
+# 1. Initialize Terraform
+if [ ! -d ".terraform" ]; then
+ echo "Initializing Terraform..."
+ terraform init
+else
+ # Verify terraform initialization is valid, or re-initialize
+ terraform init -upgrade=false >/dev/null 2>&1 || terraform init
+fi
+
+if [ "$COMMAND" = "destroy" ]; then
+ echo "Destroying Envoy Rate Limiter Resources..."
+ terraform destroy -auto-approve
+ exit $?
+fi
+
+if [ "$COMMAND" = "apply" ]; then
+ echo "Deploying Envoy Rate Limiter..."
+
+ echo "--------------------------------------------------"
+ echo "Creating/Updating GKE Cluster..."
+ echo "--------------------------------------------------"
+ # Deploy the cluster and wait for it to be ready.
+ terraform apply -target=time_sleep.wait_for_cluster -auto-approve
+
+ echo ""
+ echo "--------------------------------------------------"
+ echo "Deploying Application Resources..."
+ echo "--------------------------------------------------"
+ # Deploy the rest of the resources
+ terraform apply -auto-approve
+
+ echo ""
+ echo "Deployment Complete!"
+ echo "Cluster Name: $(terraform output -raw cluster_name)"
+ echo "Load Balancer IP: $(terraform output -raw load_balancer_ip)"
+ exit 0
+fi
+
+echo "Usage:"
+echo " ./deploy.sh [apply] # Initialize and deploy resources (Default)"
+echo " ./deploy.sh destroy # Destroy resources"
+exit 1
diff --git a/examples/terraform/envoy-ratelimiter/prerequisites.tf
b/examples/terraform/envoy-ratelimiter/prerequisites.tf
index 41151fae91c..44f321457a2 100644
--- a/examples/terraform/envoy-ratelimiter/prerequisites.tf
+++ b/examples/terraform/envoy-ratelimiter/prerequisites.tf
@@ -21,6 +21,7 @@ resource "google_project_service" "required" {
"container",
"iam",
"compute",
+ "monitoring",
])
service = "${each.key}.googleapis.com"
diff --git a/examples/terraform/envoy-ratelimiter/ratelimit.tf
b/examples/terraform/envoy-ratelimiter/ratelimit.tf
index c95e48927cb..96638e23563 100644
--- a/examples/terraform/envoy-ratelimiter/ratelimit.tf
+++ b/examples/terraform/envoy-ratelimiter/ratelimit.tf
@@ -158,11 +158,36 @@ resource "kubernetes_deployment" "ratelimit" {
port {
container_port = 6070
}
+ dynamic "port" {
+ for_each = var.enable_metrics ? [1] : []
+ content {
+ name = "metrics"
+ container_port = 9090
+ }
+ }
env {
- name = "USE_STATSD"
+ name = "USE_PROMETHEUS"
value = var.enable_metrics ? "true" : "false"
}
+ dynamic "env" {
+ for_each = var.enable_metrics ? [1] : []
+ content {
+ name = "PROMETHEUS_ADDR"
+ value = ":9090"
+ }
+ }
+ dynamic "env" {
+ for_each = var.enable_metrics ? [1] : []
+ content {
+ name = "PROMETHEUS_PATH"
+ value = "/metrics"
+ }
+ }
+ env {
+ name = "USE_STATSD"
+ value = "false"
+ }
env {
name = "DISABLE_STATS"
value = var.enable_metrics ? "false" : "true"
@@ -203,14 +228,6 @@ resource "kubernetes_deployment" "ratelimit" {
name = "CONFIG_TYPE"
value = "FILE"
}
- env {
- name = "STATSD_HOST"
- value = "localhost"
- }
- env {
- name = "STATSD_PORT"
- value = "9125"
- }
env {
name = "GRPC_MAX_CONNECTION_AGE"
value = var.ratelimit_grpc_max_connection_age
@@ -231,41 +248,7 @@ resource "kubernetes_deployment" "ratelimit" {
}
}
- dynamic "container" {
- for_each = var.enable_metrics ? [1] : []
- content {
- name = "statsd-exporter"
- image = var.statsd_exporter_image
- args = ["--log.format=json"]
-
- dynamic "port" {
- for_each = var.enable_metrics ? [1] : []
- content {
- name = "metrics"
- container_port = 9102
- }
- }
- dynamic "port" {
- for_each = var.enable_metrics ? [1] : []
- content {
- name = "statsd-udp"
- container_port = 9125
- protocol = "UDP"
- }
- }
- # statsd-exporter does not use much resources, so setting
resources to the minimum
- resources {
- requests = {
- cpu = "50m"
- memory = "64Mi"
- }
- limits = {
- cpu = "100m"
- memory = "128Mi"
- }
- }
- }
- }
+
volume {
name = "config-volume"
@@ -361,8 +344,8 @@ resource "kubernetes_service" "ratelimit" {
for_each = var.enable_metrics ? [1] : []
content {
name = "metrics"
- port = 9102
- target_port = 9102
+ port = 9090
+ target_port = 9090
}
}
}
@@ -398,15 +381,38 @@ resource "kubernetes_service" "ratelimit_external" {
port = 6070
target_port = 6070
}
- dynamic "port" {
- for_each = var.enable_metrics ? [1] : []
- content {
- name = "metrics"
- port = 9102
- target_port = 9102
- }
- }
+
}
depends_on = [kubernetes_namespace.ratelimit_namespace]
}
+
+# Pod Monitoring
+resource "kubernetes_manifest" "ratelimit_pod_monitoring" {
+ manifest = {
+ apiVersion = "monitoring.googleapis.com/v1"
+ kind = "PodMonitoring"
+ metadata = {
+ name = "ratelimit-monitoring"
+ namespace = var.namespace
+ }
+ spec = {
+ selector = {
+ matchLabels = {
+ app = "ratelimit"
+ }
+ }
+ endpoints = [
+ {
+ port = "metrics"
+ path = "/metrics"
+ interval = "15s"
+ }
+ ]
+ }
+ }
+ depends_on = [
+ kubernetes_deployment.ratelimit,
+ time_sleep.wait_for_cluster
+ ]
+}
diff --git a/examples/terraform/envoy-ratelimiter/variables.tf
b/examples/terraform/envoy-ratelimiter/variables.tf
index b7a77114821..3d732372acf 100644
--- a/examples/terraform/envoy-ratelimiter/variables.tf
+++ b/examples/terraform/envoy-ratelimiter/variables.tf
@@ -183,7 +183,7 @@ variable "namespace" {
}
variable "enable_metrics" {
- description = "Whether to deploy the statsd-exporter sidecar for Prometheus
metrics"
+ description = "Enable metrics export to Google Cloud Monitoring"
type = bool
- default = false
+ default = true
}