Prometheus with Alertmanager hosted on AWS Fargate.

Creating the Docker images

# Alertmanager image: the upstream image plus the local alertmanager.yml.
FROM prom/alertmanager:latest
ADD alertmanager.yml /etc/alertmanager/
# Alertmanager configuration (alertmanager.yml).
global:
  resolve_timeout: 5m

route:
  # Alerts sharing the same instance and severity are grouped together.
  group_by: ['instance', 'severity']
  routes:
    # Child route for ExporterDown alerts; it names no receiver of its own,
    # so it uses the root route's receiver below.
    - match:
        alertname: ExporterDown
  # Receiver of the root route.
  receiver: 'pushover'

receivers:
  # NOTE(review): no pushover_configs are listed for this receiver here;
  # confirm the user key/token are supplied before relying on notifications.
  - name: 'pushover'
# Prometheus image: the upstream image plus the local configuration,
# alerting rules and target files.
FROM prom/prometheus:v2.14.0
ADD prometheus.yml /etc/prometheus/
# prometheus.yml loads "first_rules.yml", so the file must be added under that name.
ADD first_rules.yml /etc/prometheus/
# The destination is a directory (trailing slash); a glob is only valid in the source.
ADD targets/*.yml /etc/prometheus/targets/
# Prometheus server configuration (prometheus.yml).
global:
  scrape_interval: 5s
  evaluation_interval: 10s

# Alerting rules evaluated by this server.
rule_files:
  - "first_rules.yml"

scrape_configs:
  # Prometheus scraping itself.
  - job_name: prometheus
    static_configs:
      - targets: ['localhost:9090']

  # EC2 instances discovered through the EC2 API, scraped on port 9100.
  - job_name: 'node-exporter'
    ec2_sd_configs:
      - port: 9100
        region: 'us-east-1'
        profile: '<enter profile name>'
    relabel_configs:
      # Keep only instances whose service_name tag matches the regex.
      - source_labels: [__meta_ec2_tag_service_name]
        action: keep
        regex: '<enter regex string>'
      # Copy the name tag into the instance label.
      - source_labels: [__meta_ec2_tag_name]
        target_label: instance
      - source_labels: [__meta_ec2_instance_id]
        target_label: instance_id

  # Static target on localhost:9222.
  - job_name: ecs-exporter
    static_configs:
      - targets: ['localhost:9222']

  # EC2 instances discovered through the EC2 API, scraped on port 9404.
  - job_name: 'jmx-exporter'
    ec2_sd_configs:
      - port: 9404
        region: 'us-east-1'
        profile: '<enter profile name>'
    relabel_configs:
      # Keep only instances whose service_name tag matches the regex.
      - source_labels: [__meta_ec2_tag_service_name]
        action: keep
        regex: '<enter regex string>'
      # Copy the name tag into the name label.
      - source_labels: [__meta_ec2_tag_name]
        target_label: name
      - source_labels: [__meta_ec2_instance_id]
        target_label: instance_id

# Alertmanager instance that receives fired alerts.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:9093']
# Alerting rules (first_rules.yml).
groups:
  - name: alert
    rules:
      # Fires when no Alertmanager has been discovered for 5 minutes.
      - alert: PrometheusNotConnectedToAlertManager
        expr: prometheus_notifications_alertmanagers_discovered < 1
        for: 5m
        labels:
          severity: error
        annotations:
          summary: "Prometheus is not connected to an alertmanager (instance {{ $labels.instance }})"
          description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # Fires when a node-exporter target has been down for 3 minutes.
      # The job label must equal the scrape job name ("node-exporter").
      - alert: ExporterDown
        expr: up{job="node-exporter"} == 0
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Exporter down ( instance {{ $labels.instance }})"
          description: "Prometheus exporter down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # Fires when less than 10% of memory has been available for 5 minutes.
      - alert: OutOfMemory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Out of memory ( instance {{ $labels.instance }})"
          description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # Fires when less than 10% of the /rootfs filesystem has been free for 5 minutes.
      - alert: OutOfDiskSpace
        expr: (node_filesystem_avail_bytes{mountpoint="/rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/rootfs"} < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Out of disk space (instance {{ $labels.instance }})"
          description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # Fires when non-idle CPU time per instance has exceeded 80% for 5 minutes.
      - alert: HighCpuLoad
        expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU load (instance {{ $labels.instance }})"
          description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

Building and pushing to Elastic Container Registry

prometheus/
└── docker/
    ├── alertmanager/
    └── prometheus/
# Log the local Docker client in to the ECR registry.
# NOTE(review): this line uses <accountID> and us-east-1 while the commands below
# use <account> and <region>; presumably the same account and region — confirm.
aws-runas <name of account> aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin <accountID>.dkr.ecr.us-east-1.amazonaws.com
# Build, tag and push the Alertmanager image.
# NOTE(review): presumably run from the directory holding the Alertmanager Dockerfile — confirm.
docker build -t alertmanager .
docker tag alertmanager:latest <account>.dkr.ecr.<region>.amazonaws.com/alertmanager:latest
docker push <account>.dkr.ecr.<region>.amazonaws.com/alertmanager:latest
# Build, tag and push the Prometheus image.
# NOTE(review): presumably run from the directory holding the Prometheus Dockerfile — confirm.
docker build -t prometheus .
docker tag prometheus:latest <account>.dkr.ecr.<region>.amazonaws.com/prometheus:latest
docker push <account>.dkr.ecr.<region>.amazonaws.com/prometheus:latest

Creating the ECS module

/*==== IAM roles =====*/

# Role whose trust policy lets the ecs.amazonaws.com service assume it.
resource "aws_iam_role" "ecs-service-role" {
  name = "${var.tags["environment"]}-ecs-service-role"

  # Heredoc body is kept flush-left so the policy string is unchanged.
  assume_role_policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Action": "sts:AssumeRole",
"Principal": {
"Service": "ecs.amazonaws.com"
},
"Effect": "Allow",
"Sid": ""
}
]
}
EOF
}
# Instance role; its trust policy comes from the ecs-instance-policy document.
resource "aws_iam_role" "ecs-instance-role" {
  name               = "${var.tags["environment"]}-ecs-instance-role"
  path               = "/"
  assume_role_policy = "${data.aws_iam_policy_document.ecs-instance-policy.json}"
}
# Trust policy: allows the ec2.amazonaws.com service to call sts:AssumeRole.
data "aws_iam_policy_document" "ecs-instance-policy" {
  statement {
    actions = ["sts:AssumeRole"]

    principals {
      type        = "Service"
      identifiers = ["ec2.amazonaws.com"]
    }
  }
}
# Attaches the AWS-managed AmazonEC2ContainerServiceforEC2Role policy to the instance role.
resource "aws_iam_role_policy_attachment" "ecs-instance-role-attachment" {
  role       = "${aws_iam_role.ecs-instance-role.name}"
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role"
}
# Instance profile wrapping the ECS instance role.
resource "aws_iam_instance_profile" "ecs-instance-profile" {
  name = "${var.env["environment"]}-ecs-instance-profile"
  path = "/"
  role = "${aws_iam_role.ecs-instance-role.id}"

  # Pauses for 10 seconds after the profile is created.
  # NOTE(review): presumably a workaround for IAM propagation delay — confirm.
  provisioner "local-exec" {
    command = "sleep 10"
  }
}
/*==== Create IAM Task Definition Role =====*/

# Trust policy: allows the ecs-tasks.amazonaws.com service to call sts:AssumeRole.
data "aws_iam_policy_document" "ecs-service-policy" {
  statement {
    actions = ["sts:AssumeRole"]

    principals {
      type        = "Service"
      identifiers = ["ecs-tasks.amazonaws.com"]
    }
  }
}
# Policy granting read-only ECS List*/Describe* actions on all resources.
resource "aws_iam_policy" "policy" {
  name = "${var.env["environment"]}-ecs-exporter"

  # Heredoc body is kept flush-left so the policy string is unchanged.
  policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "",
"Effect": "Allow",
"Action": [
"ecs:ListServices",
"ecs:ListContainerInstances",
"ecs:ListClusters",
"ecs:DescribeServices",
"ecs:DescribeContainerInstances",
"ecs:DescribeClusters"
],
"Resource": "*"
}
]
}
EOF
}
# Role assumed by ECS tasks; its trust policy comes from the ecs-service-policy document.
resource "aws_iam_role" "task-definition-role" {
  name               = "${var.env["environment"]}-task-definition"
  assume_role_policy = "${data.aws_iam_policy_document.ecs-service-policy.json}"
}
# Attaches the ecs-exporter policy defined in this module to the task role.
resource "aws_iam_role_policy_attachment" "ecs-exporter" {
  role       = "${aws_iam_role.task-definition-role.name}"
  policy_arn = "${aws_iam_policy.policy.arn}"
}
# Attaches the AWS-managed AmazonECSTaskExecutionRolePolicy to the task role.
resource "aws_iam_role_policy_attachment" "ecs-service-role-attachment" {
  role       = "${aws_iam_role.task-definition-role.name}"
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
}
# Attaches the AWS-managed AmazonEC2ReadOnlyAccess policy to the task role.
resource "aws_iam_role_policy_attachment" "prometheus-read-access" {
  role       = "${aws_iam_role.task-definition-role.name}"
  policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ReadOnlyAccess"
}
/*==== ECS Fargate Cluster =====*/

# ECS cluster named after the environment.
resource "aws_ecs_cluster" "main" {
  name = "${var.env["environment"]}"
}
/*==== CloudWatch =====*/

# Log group "/<environment>/prometheus" with 7-day retention.
resource "aws_cloudwatch_log_group" "logs" {
  name              = "/${var.env["environment"]}/prometheus"
  retention_in_days = "7"
  tags              = "${merge(map("Name",format("%s-cloudwatch",var.tags["environment"])),var.tags)}"
}
# Log group "/<environment>/alertmanager" with 7-day retention.
resource "aws_cloudwatch_log_group" "alertmanager" {
  name              = "/${var.env["environment"]}/alertmanager"
  retention_in_days = "7"
  tags              = "${merge(map("Name",format("%s-cloudwatch",var.tags["environment"])),var.tags)}"
}
# Name of the ECS cluster.
output "ecs_cluster" {
  value = "${aws_ecs_cluster.main.name}"
}
# ID of the ECS cluster.
output "cluster_id" {
  value = "${aws_ecs_cluster.main.id}"
}
# ARN of the task-definition role.
output "task_definition_role_arn" {
  value = "${aws_iam_role.task-definition-role.arn}"
}
# Input variables of the ECS module.
variable "env" {
  type = "map"
}

variable "tags" {
  type = "map"
}

variable "vpc_id" {
}

Creating the Security Group module

# Security group for the load balancer: TCP 9090 and 9093 in from the
# configured CIDR block, all traffic out.
resource "aws_security_group" "alb-security-group" {
  name        = "${var.env["environment"]}-alb"
  description = "ALB security group for environment ${var.tags["environment"]}"
  vpc_id      = "${var.vpc_id}"

  ingress {
    description = "Prometheus access"
    protocol    = "TCP"
    from_port   = 9090
    to_port     = 9090
    cidr_blocks = ["<enter your cidr block>"]
  }

  ingress {
    description = "AlertManager access"
    protocol    = "TCP"
    from_port   = 9093
    to_port     = 9093
    cidr_blocks = ["<enter your cidr block>"]
  }

  egress {
    protocol    = "-1"
    from_port   = 0
    to_port     = 0
    cidr_blocks = ["0.0.0.0/0"]
  }

  lifecycle {
    create_before_destroy = true
  }

  tags = merge(
    map("Name", format("%s-alb", var.tags["environment"])),
    var.tags,
    map("service-name", format("%s-security-group", var.tags["environment"])),
    map("service-type", "security-group")
  )
}
# Security group for the tasks: TCP 9090 and 9093 in from the ALB security
# group only, all traffic out.
resource "aws_security_group" "worker-security-group" {
  name        = "${var.env["environment"]}-worker"
  description = "Worker security group for environment ${var.tags["environment"]}"
  vpc_id      = "${var.vpc_id}"

  ingress {
    from_port = 9090
    to_port   = 9090
    protocol  = "TCP"
    # Source security groups are referenced by ID, not by ARN.
    security_groups = ["${aws_security_group.alb-security-group.id}"]
    description     = "Load balancer to Prometheus"
  }

  ingress {
    from_port = 9093
    to_port   = 9093
    protocol  = "TCP"
    # Source security groups are referenced by ID, not by ARN.
    security_groups = ["${aws_security_group.alb-security-group.id}"]
    description     = "Load balancer to AlertManager"
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }

  lifecycle {
    create_before_destroy = true
  }

  tags = merge(
    map("Name", format("%s-worker", var.tags["environment"])),
    var.tags,
    map("service-name", format("%s-security-group", var.tags["environment"])),
    map("service-type", "security-group")
  )
}
# Names and IDs of the two security groups.
output "alb-security-group" {
  value = "${aws_security_group.alb-security-group.name}"
}

output "alb-security-group-id" {
  value = "${aws_security_group.alb-security-group.id}"
}

output "worker-security-group" {
  value = "${aws_security_group.worker-security-group.name}"
}

output "worker-security-group-id" {
  value = "${aws_security_group.worker-security-group.id}"
}
# Input variables of the security-groups module.
variable "vpc_id" {
}

variable "tags" {
  type = "map"
}

variable "env" {
  type = "map"
}

Creating the application module

prometheus/
├── docker/
│   ├── alertmanager/
│   └── prometheus/
└── modules/
    ├── ecs/
    ├── security-groups/
    └── app/
        └── templates/
# Subnets in the VPC tagged network = "private".
data "aws_subnet_ids" "private" {
  vpc_id = "${var.vpc_id}"

  tags = {
    network = "private"
  }
}
# Subnets in the VPC tagged network = "public".
data "aws_subnet_ids" "public" {
  vpc_id = "${var.vpc_id}"

  tags = {
    network = "public"
  }
}
/*==== Create ECS Task Definition =====*/

# Renders templates/task_definition.json with the values below.
data "template_file" "container_definition" {
  template = "${file("${path.module}/templates/task_definition.json")}"

  vars = {
    app_name       = local.app.app_name
    name           = "${var.env["environment"]}-${local.app.app_name}"
    prom_image     = local.app.prom_image
    alrt_image     = local.app.alrt_image
    app_cpu        = local.app.app_cpu
    app_memory     = local.app.app_memory
    awslogs-group  = "${var.env["environment"]}"
    awslogs-region = "${var.env["region"]}"
  }
}
# Fargate task definition built from the rendered container definitions.
# The same role is used as both execution role and task role.
resource "aws_ecs_task_definition" "app" {
  family                   = "${var.env["environment"]}-${var.app["app_name"]}"
  requires_compatibilities = ["FARGATE"]
  network_mode             = "${var.network_mode}"
  cpu                      = "${var.app["fargate_cpu"]}"
  memory                   = "${var.app["fargate_memory"]}"
  execution_role_arn       = "${var.task-definition-role-arn}"
  task_role_arn            = "${var.task-definition-role-arn}"
  container_definitions    = "${data.template_file.container_definition.rendered}"
}
/*==== Create ECS Service =====*/

# Fargate service running the task in the private subnets, registered with
# both the Prometheus (9090) and Alertmanager (9093) target groups.
resource "aws_ecs_service" "main" {
  name            = "${var.env["environment"]}-${var.app["app_name"]}"
  cluster         = "${var.ecs_cluster}"
  task_definition = "${aws_ecs_task_definition.app.family}:${aws_ecs_task_definition.app.revision}"
  desired_count   = "${var.app_count}"
  launch_type     = "FARGATE"

  network_configuration {
    security_groups = ["${var.worker-security-group-id}"]
    subnets         = "${data.aws_subnet_ids.private.ids}"
  }

  load_balancer {
    target_group_arn = "${aws_alb_target_group.server.arn}"
    container_name   = "${var.env["environment"]}-${var.app["app_name"]}"
    container_port   = 9090
  }

  load_balancer {
    target_group_arn = "${aws_alb_target_group.alertmanager.arn}"
    container_name   = "alertmanager"
    container_port   = 9093
  }

  # Both target groups used above must already be attached to the load
  # balancer through their listeners before the service is created.
  depends_on = [
    "aws_alb_listener.server",
    "aws_alb_listener.alertmanager",
  ]
}
/*==== Create Route53 record =====*/

# Looks up the hosted zone by name.
data "aws_route53_zone" "hosted_zone" {
  name = "${var.hosted_zone}"
}
# Alias A record in the hosted zone pointing at the load balancer.
resource "aws_route53_record" "frontend_alb_r53" {
  zone_id    = "${data.aws_route53_zone.hosted_zone.zone_id}"
  name       = "${var.app["route53_name"]}"
  type       = "A"
  depends_on = ["aws_alb.main"]

  alias {
    name                   = "${aws_alb.main.dns_name}"
    zone_id                = "${aws_alb.main.zone_id}"
    evaluate_target_health = true
  }

  lifecycle {
    create_before_destroy = true
  }
}
/*==== Create Application Load Balancer =====*/

# Application load balancer in the public subnets, using the ALB security group.
resource "aws_alb" "main" {
  name               = "${var.env["environment"]}-${var.app["app_name"]}"
  load_balancer_type = "application"
  subnets            = "${data.aws_subnet_ids.public.ids}"
  security_groups    = ["${var.alb-security-group-id}"]
  tags               = "${merge(map("Name", format("%s-${var.app["app_name"]}-app-alb", var.tags["environment"])),var.tags)}"
}
# IP-type HTTP target group on the application port; health checks accept 200 or 302.
resource "aws_alb_target_group" "server" {
  name        = "${var.env["environment"]}-${var.app["app_name"]}"
  vpc_id      = "${var.vpc_id}"
  target_type = "ip"
  protocol    = "HTTP"
  port        = "${var.app["app_port"]}"

  health_check {
    matcher = "200,302"
  }
}
# IP-type HTTP target group on port 9093; health checks accept 200 or 302.
resource "aws_alb_target_group" "alertmanager" {
  name        = "${var.env["environment"]}-alrtmgr"
  port        = 9093
  protocol    = "HTTP"
  target_type = "ip"
  vpc_id      = "${var.vpc_id}"

  health_check {
    matcher = "200,302"
  }
}
# HTTP listener on the application port, forwarding to the server target group.
resource "aws_alb_listener" "server" {
  load_balancer_arn = "${aws_alb.main.id}"
  protocol          = "HTTP"
  port              = "${var.app["app_port"]}"

  default_action {
    type             = "forward"
    target_group_arn = "${aws_alb_target_group.server.id}"
  }
}
# HTTP listener on port 9100, forwarding to the server target group.
# NOTE(review): the ALB security group in the sibling module opens only 9090
# and 9093 — confirm this listener is intended.
resource "aws_alb_listener" "node" {
  load_balancer_arn = "${aws_alb.main.id}"
  protocol          = "HTTP"
  port              = 9100

  default_action {
    type             = "forward"
    target_group_arn = "${aws_alb_target_group.server.id}"
  }
}
# HTTP listener on port 9093, forwarding to the alertmanager target group.
resource "aws_alb_listener" "alertmanager" {
  load_balancer_arn = "${aws_alb.main.id}"
  protocol          = "HTTP"
  port              = 9093

  default_action {
    type             = "forward"
    target_group_arn = "${aws_alb_target_group.alertmanager.id}"
  }
}
# Input variables of the app module.
variable "env" {
  type = "map"
}

variable "tags" {
  type = "map"
}

variable "vpc_id" {
}

variable "app" {
  type = "map"
}

variable "app_port" {
}

variable "ecs_cluster" {
}

variable "app_count" {
}

variable "worker-security-group-id" {
}

variable "alb-security-group-id" {
}

variable "task-definition-role-arn" {
}

variable "fargate_cpu" {
}

variable "fargate_memory" {
}

variable "network_mode" {
}

variable "hosted_zone" {
}
# local.app is var.app layered over these defaults; keys set in var.app win.
locals {
  defaults = {
    ecr        = ""
    route53    = ""
    prom_image = "<account>.dkr.ecr.<region>.amazonaws.com/prometheus:latest"
    alrt_image = "<account>.dkr.ecr.<region>.amazonaws.com/alertmanager:latest"
    app_port   = 9090
    app_cpu    = 1024
    # NOTE(review): 2056 may be a typo for 2048 — confirm.
    app_memory   = 2056
    hosted_zone  = "<enter hosted zone name>"
    network_mode = "awsvpc"
    app_name     = "server"
  }

  app = merge(local.defaults, var.app)
}
[
  {
    "essential": true,
    "cpu": 128,
    "image": "${prom_image}",
    "memory": 128,
    "name": "${name}",
    "networkMode": "awsvpc",
    "portMappings": [
      {
        "containerPort": 9090,
        "hostPort": 9090,
        "protocol": "tcp"
      }
    ],
    "logConfiguration": {
      "logDriver": "awslogs",
      "options": {
        "awslogs-group": "/${awslogs-group}/prometheus",
        "awslogs-region": "${awslogs-region}",
        "awslogs-stream-prefix": "prometheus"
      }
    }
  },
  {
    "essential": true,
    "cpu": 128,
    "image": "${alrt_image}",
    "memory": 64,
    "name": "alertmanager",
    "networkMode": "awsvpc",
    "portMappings": [
      {
        "containerPort": 9093,
        "hostPort": 9093,
        "protocol": "tcp"
      }
    ],
    "logConfiguration": {
      "logDriver": "awslogs",
      "options": {
        "awslogs-group": "/${awslogs-group}/alertmanager",
        "awslogs-region": "${awslogs-region}",
        "awslogs-stream-prefix": "alertmanager"
      }
    }
  },
  {
    "essential": false,
    "cpu": 10,
    "image": "coveo/ecs-exporter",
    "memory": 64,
    "name": "ecs-exporter",
    "networkMode": "awsvpc",
    "command": ["-aws.region=<enter region>"],
    "portMappings": [
      {
        "containerPort": 9222,
        "hostPort": 9222,
        "protocol": "tcp"
      }
    ]
  }
]

Bringing the build together

prometheus/
├── infrastructure/
├── docker/
│   ├── alertmanager/
│   └── prometheus/
└── modules/
    ├── ecs/
    ├── security-groups/
    └── app/
        └── templates/
# Security groups for the load balancer and the tasks.
module "security-groups" {
  source = "../modules/security-groups"

  env    = "${var.env}"
  tags   = "${var.tags}"
  vpc_id = "${var.vpc_id}"
}
# ECS cluster, IAM roles and log groups.
module "ecs" {
  # Lowercase to match the other module paths; a case mismatch breaks on
  # case-sensitive filesystems.
  source = "../modules/ecs"

  env    = "${var.env}"
  tags   = "${var.tags}"
  vpc_id = "${var.vpc_id}"
}
# Prometheus/Alertmanager application, wired to the outputs of the ecs and
# security-groups modules.
module "prometheus" {
  source = "../modules/app"

  app    = "${var.prometheus}"
  env    = "${var.env}"
  tags   = "${var.tags}"
  vpc_id = "${var.vpc_id}"

  fargate_cpu    = "${var.prometheus["fargate_cpu"]}"
  fargate_memory = "${var.prometheus["fargate_memory"]}"
  app_count      = "${var.prometheus["app_count"]}"
  app_port       = "${var.prometheus["app_port"]}"
  network_mode   = "${var.prometheus["network_mode"]}"
  hosted_zone    = "${var.prometheus["hosted_zone"]}"

  ecs_cluster = "${module.ecs.ecs_cluster}"
  # The ecs module exports this output as "task_definition_role_arn".
  task-definition-role-arn = "${module.ecs.task_definition_role_arn}"
  alb-security-group-id    = "${module.security-groups.alb-security-group-id}"
  # The module is declared as "security-groups" (hyphen, not underscore).
  worker-security-group-id = "${module.security-groups.worker-security-group-id}"
}
# Environment settings; "environment" is used as the name prefix for resources.
variable "env" {
  type = map(string)

  default = {
    environment = "prometheus-test"
    app         = "prometheus"
    region      = "<enter region>"
  }
}
# Tags applied to resources.
variable "tags" {
  type = map(string)

  default = {
    name = "prometheus-test"
    # The modules index var.tags["environment"]; without this key that lookup
    # fails. The value mirrors var.env["environment"].
    environment = "prometheus-test"
    terraform   = "0.12.9"
    description = "Prometheus - build by me"
  }
}
# Settings for the Prometheus application module.
variable "prometheus" {
  type = map(string)

  default = {
    prom_image     = "<account>.dkr.ecr.<region>.amazonaws.com/prometheus:latest"
    alrt_image     = "<account>.dkr.ecr.<region>.amazonaws.com/alertmanager:latest"
    app_name       = "server"
    route53_name   = "prometheus"
    app_port       = 9090
    fargate_cpu    = 1024
    fargate_memory = 2048
    app_count      = 1
    hosted_zone    = "<hosted_zone name>"
    # Read by the prometheus module call as var.prometheus["network_mode"];
    # "awsvpc" matches the app module's own default.
    network_mode = "awsvpc"
  }
}
# ID of the VPC everything is deployed into.
variable "vpc_id" {
  default = "<VPC>"
}

Final thoughts

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store