Update Prometheus config, Add Jaeger

Neenad Ingole
1 parent 47d39618
Showing 8 changed files with 260 additions and 5 deletions Show diff stats
jaeger/docker-compose.yml
prometheus-grafana/alertmanager/config.yml
prometheus-grafana/docker-compose.yml
prometheus-grafana/grafana/config.monitoring
prometheus-grafana/grafana/provisioning/dashboards/dashboard.yml
prometheus-grafana/grafana/provisioning/datasources/datasource.yml
prometheus-grafana/prometheus/alert.rules
prometheus-grafana/prometheus/prometheus.yml
@@ -0,0 +1,19 @@
+version: '3.8'
+services:
+  jaeger:
+    image: jaegertracing/all-in-one:latest
+    ports:
+      - '6831:6831/udp'
+      - '16686:16686'
+  hotrod:
+    image: jaegertracing/example-hotrod:latest
+    ports:
+      - '8082:8080'
+    command: ['all']
+    environment:
+      - JAEGER_AGENT_HOST=jaeger
+      # Note: if your application is using Node.js Jaeger Client, you need port 6832,
+      #       unless issue https://github.com/jaegertracing/jaeger/issues/1596 is resolved.
+      - JAEGER_AGENT_PORT=6831
+    depends_on:
+      - jaeger
@@ -0,0 +1,10 @@
+route:
+  receiver: 'slack'
+
+receivers:
+  - name: 'slack'
+#       slack_configs:
+#           - send_resolved: true
+#             username: '<username>'
+#             channel: '#<channel-name>'
+#             api_url: '<incoming-webhook-url>'
 \ No newline at end of file
 version: '3.8'
  
+volumes:
+  prometheus_data: {}
+  grafana_data: {}
+
 services:
   prometheus:
     image: prom/prometheus
     restart: always
     volumes:
-      - ./prometheus.yml:/etc/prometheus/prometheus.yml
+      - ./prometheus:/etc/prometheus/
+      - prometheus_data:/prometheus
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
+      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
+      - '--web.console.templates=/usr/share/prometheus/consoles'
     ports:
       - 9090:9090
-  
+    links:
+      - cadvisor:cadvisor
+      - alertmanager:alertmanager
+    depends_on:
+      - cadvisor
+
+  node-exporter:
+    image: prom/node-exporter
+    volumes:
+      - /proc:/host/proc:ro
+      - /sys:/host/sys:ro
+      - /:/rootfs:ro
+    command:
+      - '--path.procfs=/host/proc'
+      - '--path.sysfs=/host/sys'
+      - --collector.filesystem.ignored-mount-points
+      - '^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)'
+    ports:
+      - 9100:9100
+    restart: always
+    deploy:
+      mode: global
+
+  alertmanager:
+    image: prom/alertmanager
+    restart: always
+    ports:
+      - 9093:9093
+    volumes:
+      - ./alertmanager/:/etc/alertmanager/
+    command:
+      - '--config.file=/etc/alertmanager/config.yml'
+      - '--storage.path=/alertmanager'
+
+  cadvisor:
+    image: gcr.io/cadvisor/cadvisor
+    volumes:
+      - /:/rootfs:ro
+      - /var/run:/var/run:rw
+      - /sys:/sys:ro
+      - /var/lib/docker/:/var/lib/docker:ro
+    ports:
+      - 8080:8080
+    restart: always
+    deploy:
+      mode: global
+
   grafana:
     image: grafana/grafana
-    environment:
-      GF_INSTALL_PLUGINS: "grafana-clock-panel,grafana-simple-json-datasource"
+    user: '472'
     restart: always
+    environment:
+      GF_INSTALL_PLUGINS: 'grafana-clock-panel,grafana-simple-json-datasource'
+    volumes:
+      - grafana_data:/var/lib/grafana
+      - ./grafana/provisioning/:/etc/grafana/provisioning/
+    env_file:
+      - ./grafana/config.monitoring
     ports:
       - 3000:3000
     depends_on:
-      - prometheus
 \ No newline at end of file
+      - prometheus
@@ -0,0 +1,2 @@
+GF_SECURITY_ADMIN_PASSWORD=foobar
+GF_USERS_ALLOW_SIGN_UP=false
 \ No newline at end of file
@@ -0,0 +1,11 @@
+apiVersion: 1
+
+providers:
+- name: 'Prometheus'
+  orgId: 1
+  folder: ''
+  type: file
+  disableDeletion: false
+  editable: true
+  options:
+    path: /etc/grafana/provisioning/dashboards
 \ No newline at end of file
@@ -0,0 +1,50 @@
+# config file version
+apiVersion: 1
+
+# list of datasources that should be deleted from the database
+deleteDatasources:
+  - name: Prometheus
+    orgId: 1
+
+# list of datasources to insert/update depending
+# on what's available in the database
+datasources:
+  # <string, required> name of the datasource. Required
+- name: Prometheus
+  # <string, required> datasource type. Required
+  type: prometheus
+  # <string, required> access mode. direct or proxy. Required
+  access: proxy
+  # <int> org id. will default to orgId 1 if not specified
+  orgId: 1
+  # <string> url
+  url: http://prometheus:9090
+  # <string> database password, if used
+  password:
+  # <string> database user, if used
+  user:
+  # <string> database name, if used
+  database:
+  # <bool> enable/disable basic auth
+  basicAuth: false
+  # <string> basic auth username, if used
+  basicAuthUser:
+  # <string> basic auth password, if used
+  basicAuthPassword:
+  # <bool> enable/disable with credentials headers
+  withCredentials:
+  # <bool> mark as default datasource. Max one per org
+  isDefault: true
+  # <map> fields that will be converted to json and stored in json_data
+  jsonData:
+     graphiteVersion: "1.1"
+     tlsAuth: false
+     tlsAuthWithCACert: false
+  # <string> json object of data that will be encrypted.
+  secureJsonData:
+    tlsCACert: "..."
+    tlsClientCert: "..."
+    tlsClientKey: "..."
+  version: 1
+  # <bool> allow users to edit datasources from the UI.
+  editable: true
 \ No newline at end of file
@@ -0,0 +1,22 @@
+groups:
+- name: example
+  rules:
+
+  # Alert for any instance that is unreachable for >2 minutes.
+  - alert: service_down
+    expr: up == 0
+    for: 2m
+    labels:
+      severity: page
+    annotations:
+      summary: "Instance {{ $labels.instance }} down"
+      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."
+
+  - alert: high_load
+    expr: node_load1 > 0.5
+    for: 2m
+    labels:
+      severity: page
+    annotations:
+      summary: "Instance {{ $labels.instance }} under high load"
+      description: "{{ $labels.instance }} of job {{ $labels.job }} is under high load."
@@ -0,0 +1,79 @@
+# my global config
+global:
+  scrape_interval:     15s # By default, scrape targets every 15 seconds.
+  evaluation_interval: 15s # By default, evaluate rules every 15 seconds.
+  # scrape_timeout is set to the global default (10s).
+
+  # Attach these labels to any time series or alerts when communicating with
+  # external systems (federation, remote storage, Alertmanager).
+  external_labels:
+      monitor: 'my-project'
+
+# Load and evaluate rules in this file every 'evaluation_interval' seconds.
+rule_files:
+  - 'alert.rules'
+  # - "first.rules"
+  # - "second.rules"
+
+# Alertmanager instances that Prometheus sends alerts to
+alerting:
+  alertmanagers:
+  - scheme: http
+    static_configs:
+    - targets:
+      - "alertmanager:9093"
+
+# Scrape configurations, one job per monitored endpoint:
+# the application, Prometheus itself, cAdvisor and node-exporter.
+scrape_configs:
+  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
+
+  - job_name: app
+    scrape_interval: 5s
+    static_configs:
+      - targets: ['host.docker.internal:8000']
+
+  - job_name: 'prometheus'
+
+    # Override the global default and scrape targets from this job every 5 seconds.
+    scrape_interval: 5s
+
+    static_configs:
+         - targets: ['localhost:9090']
+
+
+  - job_name: 'cadvisor'
+
+    # Override the global default and scrape targets from this job every 5 seconds.
+    scrape_interval: 5s
+
+    dns_sd_configs:
+    - names:
+      - 'tasks.cadvisor'
+      type: 'A'
+      port: 8080
+
+#     static_configs:
+#          - targets: ['cadvisor:8080']
+
+  - job_name: 'node-exporter'
+
+    # Override the global default and scrape targets from this job every 5 seconds.
+    scrape_interval: 5s
+
+    dns_sd_configs:
+    - names:
+      - 'tasks.node-exporter'
+      type: 'A'
+      port: 9100
+
+#  - job_name: 'pushgateway'
+#    scrape_interval: 10s
+#    dns_sd_configs:
+#    - names:
+#      - 'tasks.pushgateway'
+#      type: 'A'
+#      port: 9091
+
+#     static_configs:
+#          - targets: ['node-exporter:9100']
 \ No newline at end of file
...	...	@@ -0,0 +1,19 @@
	1	+version: '3.8'
	2	+services:
	3	+ jaeger:
	4	+ image: jaegertracing/all-in-one:latest
	5	+ ports:
	6	+ - '6831:6831/udp'
	7	+ - '16686:16686'
	8	+ hotrod:
	9	+ image: jaegertracing/example-hotrod:latest
	10	+ ports:
	11	+ - '8082:8080'
	12	+ command: ['all']
	13	+ environment:
	14	+ - JAEGER_AGENT_HOST=jaeger
	15	+ # Note: if your application is using Node.js Jaeger Client, you need port 6832,
	16	+ # unless issue https://github.com/jaegertracing/jaeger/issues/1596 is resolved.
	17	+ - JAEGER_AGENT_PORT=6831
	18	+ depends_on:
	19	+ - jaeger
...	...
...	...	@@ -0,0 +1,10 @@
	1	+route:
	2	+ receiver: 'slack'
	3	+
	4	+receivers:
	5	+ - name: 'slack'
	6	+# slack_configs:
	7	+# - send_resolved: true
	8	+# username: '<username>'
	9	+# channel: '#<channel-name>'
	10	+# api_url: '<incoming-webhook-url>'
0	11	\ No newline at end of file
...	...
1	1	version: '3.8'
2	2
	3	+volumes:
	4	+ prometheus_data: {}
	5	+ grafana_data: {}
	6	+
3	7	services:
4	8	prometheus:
5	9	image: prom/prometheus
6	10	restart: always
7	11	volumes:
8		- - ./prometheus.yml:/etc/prometheus/prometheus.yml
	12	+ - ./prometheus:/etc/prometheus/
	13	+ - prometheus_data:/prometheus
	14	+ command:
	15	+ - '--config.file=/etc/prometheus/prometheus.yml'
	16	+ - '--storage.tsdb.path=/prometheus'
	17	+ - '--web.console.libraries=/usr/share/prometheus/console_libraries'
	18	+ - '--web.console.templates=/usr/share/prometheus/consoles'
9	19	ports:
10	20	- 9090:9090
11		-
	21	+ links:
	22	+ - cadvisor:cadvisor
	23	+ - alertmanager:alertmanager
	24	+ depends_on:
	25	+ - cadvisor
	26	+
	27	+ node-exporter:
	28	+ image: prom/node-exporter
	29	+ volumes:
	30	+ - /proc:/host/proc:ro
	31	+ - /sys:/host/sys:ro
	32	+ - /:/rootfs:ro
	33	+ command:
	34	+ - '--path.procfs=/host/proc'
	35	+ - '--path.sysfs=/host/sys'
	36	+ - --collector.filesystem.ignored-mount-points
	37	+ - '^/(sys\|proc\|dev\|host\|etc\|rootfs/var/lib/docker/containers\|rootfs/var/lib/docker/overlay2\|rootfs/run/docker/netns\|rootfs/var/lib/docker/aufs)($$\|/)'
	38	+ ports:
	39	+ - 9100:9100
	40	+ restart: always
	41	+ deploy:
	42	+ mode: global
	43	+
	44	+ alertmanager:
	45	+ image: prom/alertmanager
	46	+ restart: always
	47	+ ports:
	48	+ - 9093:9093
	49	+ volumes:
	50	+ - ./alertmanager/:/etc/alertmanager/
	51	+ command:
	52	+ - '--config.file=/etc/alertmanager/config.yml'
	53	+ - '--storage.path=/alertmanager'
	54	+
	55	+ cadvisor:
	56	+ image: gcr.io/cadvisor/cadvisor
	57	+ volumes:
	58	+ - /:/rootfs:ro
	59	+ - /var/run:/var/run:rw
	60	+ - /sys:/sys:ro
	61	+ - /var/lib/docker/:/var/lib/docker:ro
	62	+ ports:
	63	+ - 8080:8080
	64	+ restart: always
	65	+ deploy:
	66	+ mode: global
	67	+
12	68	grafana:
13	69	image: grafana/grafana
14		- environment:
15		- GF_INSTALL_PLUGINS: "grafana-clock-panel,grafana-simple-json-datasource"
	70	+ user: '472'
16	71	restart: always
	72	+ environment:
	73	+ GF_INSTALL_PLUGINS: 'grafana-clock-panel,grafana-simple-json-datasource'
	74	+ volumes:
	75	+ - grafana_data:/var/lib/grafana
	76	+ - ./grafana/provisioning/:/etc/grafana/provisioning/
	77	+ env_file:
	78	+ - ./grafana/config.monitoring
17	79	ports:
18	80	- 3000:3000
19	81	depends_on:
20		- - prometheus
21	82	\ No newline at end of file
	83	+ - prometheus
...	...
...	...	@@ -0,0 +1,2 @@
	1	+GF_SECURITY_ADMIN_PASSWORD=foobar
	2	+GF_USERS_ALLOW_SIGN_UP=false
0	3	\ No newline at end of file
...	...
...	...	@@ -0,0 +1,11 @@
	1	+apiVersion: 1
	2	+
	3	+providers:
	4	+- name: 'Prometheus'
	5	+ orgId: 1
	6	+ folder: ''
	7	+ type: file
	8	+ disableDeletion: false
	9	+ editable: true
	10	+ options:
	11	+ path: /etc/grafana/provisioning/dashboards
0	12	\ No newline at end of file
...	...
...	...	@@ -0,0 +1,50 @@
	1	+# config file version
	2	+apiVersion: 1
	3	+
	4	+# list of datasources that should be deleted from the database
	5	+deleteDatasources:
	6	+ - name: Prometheus
	7	+ orgId: 1
	8	+
	9	+# list of datasources to insert/update depending
	10	+# on what's available in the database
	11	+datasources:
	12	+ # <string, required> name of the datasource. Required
	13	+- name: Prometheus
	14	+ # <string, required> datasource type. Required
	15	+ type: prometheus
	16	+ # <string, required> access mode. direct or proxy. Required
	17	+ access: proxy
	18	+ # <int> org id. will default to orgId 1 if not specified
	19	+ orgId: 1
	20	+ # <string> url
	21	+ url: http://prometheus:9090
	22	+ # <string> database password, if used
	23	+ password:
	24	+ # <string> database user, if used
	25	+ user:
	26	+ # <string> database name, if used
	27	+ database:
	28	+ # <bool> enable/disable basic auth
	29	+ basicAuth: false
	30	+ # <string> basic auth username, if used
	31	+ basicAuthUser:
	32	+ # <string> basic auth password, if used
	33	+ basicAuthPassword:
	34	+ # <bool> enable/disable with credentials headers
	35	+ withCredentials:
	36	+ # <bool> mark as default datasource. Max one per org
	37	+ isDefault: true
	38	+ # <map> fields that will be converted to json and stored in json_data
	39	+ jsonData:
	40	+ graphiteVersion: "1.1"
	41	+ tlsAuth: false
	42	+ tlsAuthWithCACert: false
	43	+ # <string> json object of data that will be encrypted.
	44	+ secureJsonData:
	45	+ tlsCACert: "..."
	46	+ tlsClientCert: "..."
	47	+ tlsClientKey: "..."
	48	+ version: 1
	49	+ # <bool> allow users to edit datasources from the UI.
	50	+ editable: true
0	51	\ No newline at end of file
...	...
...	...	@@ -0,0 +1,22 @@
	1	+groups:
	2	+- name: example
	3	+ rules:
	4	+
	5	+ # Alert for any instance that is unreachable for >2 minutes.
	6	+ - alert: service_down
	7	+ expr: up == 0
	8	+ for: 2m
	9	+ labels:
	10	+ severity: page
	11	+ annotations:
	12	+ summary: "Instance {{ $labels.instance }} down"
	13	+ description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."
	14	+
	15	+ - alert: high_load
	16	+ expr: node_load1 > 0.5
	17	+ for: 2m
	18	+ labels:
	19	+ severity: page
	20	+ annotations:
	21	+ summary: "Instance {{ $labels.instance }} under high load"
	22	+ description: "{{ $labels.instance }} of job {{ $labels.job }} is under high load."
...	...
...	...	@@ -0,0 +1,79 @@
	1	+# my global config
	2	+global:
	3	+ scrape_interval: 15s # By default, scrape targets every 15 seconds.
	4	+ evaluation_interval: 15s # By default, evaluate rules every 15 seconds.
	5	+ # scrape_timeout is set to the global default (10s).
	6	+
	7	+ # Attach these labels to any time series or alerts when communicating with
	8	+ # external systems (federation, remote storage, Alertmanager).
	9	+ external_labels:
	10	+ monitor: 'my-project'
	11	+
	12	+# Load and evaluate rules in this file every 'evaluation_interval' seconds.
	13	+rule_files:
	14	+ - 'alert.rules'
	15	+ # - "first.rules"
	16	+ # - "second.rules"
	17	+
	18	+# Alertmanager instances that Prometheus sends alerts to
	19	+alerting:
	20	+ alertmanagers:
	21	+ - scheme: http
	22	+ static_configs:
	23	+ - targets:
	24	+ - "alertmanager:9093"
	25	+
	26	+# Scrape configurations, one job per monitored endpoint:
	27	+# the application, Prometheus itself, cAdvisor and node-exporter.
	28	+scrape_configs:
	29	+ # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
	30	+
	31	+ - job_name: app
	32	+ scrape_interval: 5s
	33	+ static_configs:
	34	+ - targets: ['host.docker.internal:8000']
	35	+
	36	+ - job_name: 'prometheus'
	37	+
	38	+ # Override the global default and scrape targets from this job every 5 seconds.
	39	+ scrape_interval: 5s
	40	+
	41	+ static_configs:
	42	+ - targets: ['localhost:9090']
	43	+
	44	+
	45	+ - job_name: 'cadvisor'
	46	+
	47	+ # Override the global default and scrape targets from this job every 5 seconds.
	48	+ scrape_interval: 5s
	49	+
	50	+ dns_sd_configs:
	51	+ - names:
	52	+ - 'tasks.cadvisor'
	53	+ type: 'A'
	54	+ port: 8080
	55	+
	56	+# static_configs:
	57	+# - targets: ['cadvisor:8080']
	58	+
	59	+ - job_name: 'node-exporter'
	60	+
	61	+ # Override the global default and scrape targets from this job every 5 seconds.
	62	+ scrape_interval: 5s
	63	+
	64	+ dns_sd_configs:
	65	+ - names:
	66	+ - 'tasks.node-exporter'
	67	+ type: 'A'
	68	+ port: 9100
	69	+
	70	+# - job_name: 'pushgateway'
	71	+# scrape_interval: 10s
	72	+# dns_sd_configs:
	73	+# - names:
	74	+# - 'tasks.pushgateway'
	75	+# type: 'A'
	76	+# port: 9091
	77	+
	78	+# static_configs:
	79	+# - targets: ['node-exporter:9100']
0	80	\ No newline at end of file
...	...