monitoring-observability

Comprehensive monitoring setup with metrics collection, log aggregation, alerting, and health checks. Covers Prometheus for metrics instrumentation, Grafana dashboards for visualization, and structured logging with Winston for log aggregation. Includes pre-built alert rules for high error rates, slow response times, pod failures, and resource saturation. Provides advanced health check endpoints that test database, cache, and external API dependencies with latency tracking. Implements the four golden signals framework: latency percentiles, request volume, error rates, and resource utilization.

INSTALLATION
npx skills add https://github.com/supercent-io/skills-template --skill monitoring-observability
Run in your project or agent environment. Adjust flags if your CLI version differs.

SKILL.md

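### Step 1: Prometheus Metrics

**Express + prom-client (Node.js)**:

import express from 'express';

import promClient from 'prom-client';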

const app = express();

// Default metrics (CPU, Memory, etc.)

promClient.collectDefaultMetrics();

// Custom metrics

// Both HTTP metrics carry the same label set.
const httpLabelNames = ['method', 'route', 'status_code'] as const;

/**
 * Histogram of HTTP request durations, in seconds.
 * No `buckets` option is passed, so the library's default buckets are used.
 */
const httpRequestDuration = new promClient.Histogram({
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: [...httpLabelNames],
});

/** Counter of HTTP requests, labelled by method, route and status code. */
const httpRequestTotal = new promClient.Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: [...httpLabelNames],
});

// Middleware to track requests

/**
 * Request-tracking middleware.
 *
 * Starts a timer when the request arrives and, once the response has been
 * fully sent (`finish`), records the elapsed time in `httpRequestDuration`
 * and increments `httpRequestTotal` with the same labels.
 */
app.use((req, res, next) => {
  // Monotonic clock: unlike Date.now(), it cannot jump when the system time
  // is adjusted, so a measured duration is never negative or inflated.
  const startedAt = process.hrtime.bigint();

  res.on('finish', () => {
    const durationSeconds = Number(process.hrtime.bigint() - startedAt) / 1e9;

    const labels = {
      method: req.method,
      // Label with the matched route pattern only. Falling back to the raw
      // request path would create one time series per distinct URL (404
      // probes, IDs embedded in paths), i.e. unbounded label cardinality.
      route: req.route?.path ?? 'unmatched',
      status_code: res.statusCode,
    };

    httpRequestDuration.observe(labels, durationSeconds);
    httpRequestTotal.inc(labels);
  });

  next();
});

// Metrics endpoint

/**
 * Metrics endpoint scraped by Prometheus.
 *
 * Serialises every metric in the default registry. The await is wrapped in
 * try/catch because a rejected promise inside an async Express 4 handler is
 * not forwarded to the error middleware; without it a failed collection
 * would leave the scrape request hanging until it times out.
 */
app.get('/metrics', async (_req, res) => {
  try {
    const body = await promClient.register.metrics();
    res.set('Content-Type', promClient.register.contentType);
    res.end(body);
  } catch (err: unknown) {
    res.status(500).end(err instanceof Error ? err.message : String(err));
  }
});

app.listen(3000);

**prometheus.yml**:

# Defaults applied to every job unless overridden.
global:
  scrape_interval: 15s      # how often targets are scraped
  evaluation_interval: 15s  # how often alerting/recording rules are evaluated

scrape_configs:
  # The instrumented application (exposes /metrics on port 3000).
  - job_name: 'my-app'
    static_configs:
      - targets: ['localhost:3000']
    metrics_path: '/metrics'

  # Host-level metrics; the node_memory_* series used by the alert rules come from here.
  - job_name: 'node-exporter'
    static_configs:
      - targets: ['localhost:9100']

# Where firing alerts are sent.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:9093']

# Rule files loaded at startup (path relative to this file).
rule_files:
  - 'alert_rules.yml'


### Step 2: Alert Rules

**alert_rules.yml**:

groups:
  - name: application_alerts
    interval: 30s  # evaluation interval for this group
    rules:
      # High error rate: share of 5xx responses over all responses, 5-minute window.
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(http_requests_total{status_code=~"5.."}[5m]))
            /
            sum(rate(http_requests_total[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          # $value is a 0-1 ratio; humanizePercentage renders it as e.g. "7.3%".
          description: "Error rate is {{ $value | humanizePercentage }} (threshold: 5%)"

      # Slow response time: p95 latency above 1 second.
      - alert: SlowResponseTime
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
          ) > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Slow response time"
          description: "95th percentile is {{ $value }}s"

      # Pod down: the scrape target for job "my-app" is unreachable.
      - alert: PodDown
        expr: up{job="my-app"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Pod is down"
          description: "{{ $labels.instance }} has been down for more than 2 minutes"

      # High memory usage: more than 90% of total memory in use (node-exporter metrics).
      - alert: HighMemoryUsage
        expr: |
          (
            node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes
          ) / node_memory_MemTotal_bytes > 0.90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          # $value is a 0-1 ratio; humanizePercentage renders it as e.g. "93.1%".
          description: "Memory usage is {{ $value | humanizePercentage }}"


### Step 3: Log Aggregation (Structured Logging)

**Winston (Node.js)**:

import winston from 'winston';

// Logger-level format: timestamp, error stack traces, JSON serialisation.
const structuredFormat = winston.format.combine(
  winston.format.timestamp(),
  winston.format.errors({ stack: true }),
  winston.format.json(),
);

// Extra format configured on the console transport only.
const consoleFormat = winston.format.combine(
  winston.format.colorize(),
  winston.format.simple(),
);

/**
 * Application logger.
 *
 * The level comes from LOG_LEVEL and falls back to 'info'. Every entry is
 * tagged with the service name and NODE_ENV. Output goes to the console,
 * to logs/error.log (level 'error' only) and to logs/combined.log.
 */
const logger = winston.createLogger({
  level: process.env.LOG_LEVEL || 'info',
  format: structuredFormat,
  defaultMeta: {
    service: 'my-app',
    environment: process.env.NODE_ENV,
  },
  transports: [
    new winston.transports.Console({ format: consoleFormat }),
    new winston.transports.File({ filename: 'logs/error.log', level: 'error' }),
    new winston.transports.File({ filename: 'logs/combined.log' }),
  ],
});

// Usage

logger.info('User logged in', { userId: '123', ip: '1.2.3.4' });

logger.error('Database connection failed', { error: err.message, stack: err.stack });

// Express middleware

/**
 * Request-logging middleware: writes one 'HTTP Request' info entry per
 * incoming request (method, path, client IP, user agent), then passes
 * control to the next handler.
 */
app.use((req, _res, next) => {
  const { method, path, ip } = req;
  const requestMeta = {
    method,
    path,
    ip,
    userAgent: req.get('user-agent'),
  };

  logger.info('HTTP Request', requestMeta);
  next();
});


### Step 4: Grafana Dashboard

**dashboard.json** (example):

{
  "dashboard": {
    "title": "Application Metrics",
    "panels": [
      {
        "title": "Request Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "sum by (method, route) (rate(http_requests_total[5m]))",
            "legendFormat": "{{method}} {{route}}"
          }
        ]
      },
      {
        "title": "Error Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "sum(rate(http_requests_total{status_code=~\"5..\"}[5m]))",
            "legendFormat": "Errors"
          }
        ]
      },
      {
        "title": "Response Time (p95)",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))"
          }
        ]
      },
      {
        "title": "CPU Usage",
        "type": "gauge",
        "targets": [
          {
            "expr": "rate(process_cpu_seconds_total[5m]) * 100"
          }
        ]
      }
    ]
  }
}


### Step 5: Health Checks

**Advanced Health Check**:

/** Outcome of probing one dependency; `latency` is set only when the probe was timed. */
interface DependencyCheck {
  status: string;
  latency?: number;
}

/** Body returned by the health endpoint. */
interface HealthStatus {
  /** Overall verdict for the service. */
  status: 'healthy' | 'degraded' | 'unhealthy';
  /** When the report was produced. */
  timestamp: string;
  /** Process uptime at the time of the report. */
  uptime: number;
  /** Per-dependency results; only the database entry can carry an error message. */
  checks: {
    database: DependencyCheck & { error?: string };
    redis: DependencyCheck;
    externalApi: DependencyCheck;
  };
}

/**
 * Health endpoint.
 *
 * Probes the database and Redis in turn, recording status and latency in
 * milliseconds for each, and reports an overall verdict:
 *   - database failure -> 'unhealthy' (HTTP 503)
 *   - Redis failure    -> 'degraded'  (HTTP 200) when the database is fine
 *   - otherwise        -> 'healthy'   (HTTP 200)
 *
 * The external API is not probed here, so its entry stays 'unknown'.
 */
app.get('/health', async (_req, res) => {
  const health: HealthStatus = {
    status: 'healthy',
    timestamp: new Date().toISOString(),
    uptime: process.uptime(),
    checks: {
      database: { status: 'unknown' },
      redis: { status: 'unknown' },
      externalApi: { status: 'unknown' },
    },
  };

  // Database check: a failure makes the whole service unhealthy.
  try {
    const dbStart = Date.now();
    await db.raw('SELECT 1');
    health.checks.database = {
      status: 'healthy',
      latency: Date.now() - dbStart,
    };
  } catch (error: unknown) {
    health.status = 'unhealthy';
    health.checks.database = {
      status: 'unhealthy',
      // Anything can be thrown, so narrow before reading `.message`.
      error: error instanceof Error ? error.message : String(error),
    };
  }

  // Redis check: a failure only degrades the service.
  try {
    const redisStart = Date.now();
    await redis.ping();
    health.checks.redis = {
      status: 'healthy',
      latency: Date.now() - redisStart,
    };
  } catch {
    // Only downgrade from 'healthy'. Assigning unconditionally would replace
    // an 'unhealthy' verdict from the database check with 'degraded' and
    // make the endpoint answer 200 while the database is down.
    if (health.status === 'healthy') {
      health.status = 'degraded';
    }
    health.checks.redis = { status: 'unhealthy' };
  }

  // 'healthy' and 'degraded' both still serve traffic; only 'unhealthy' is a 503.
  const statusCode = health.status === 'unhealthy' ? 503 : 200;
  res.status(statusCode).json(health);
});


## Output format

### Monitoring Dashboard Configuration

Golden Signals:

  1. Latency (Response Time)

- P50, P95, P99 percentiles

- Per API endpoint

  2. Traffic (Request Volume)

- Requests per second

- Per endpoint, per status code

  3. Errors (Error Rate)

- 5xx error rate

- 4xx error rate

- Per error type

  4. Saturation (Resource Utilization)

- CPU usage

- Memory usage

- Disk I/O

- Network bandwidth

BrowserAct

Let your agent run on any real-world website

Bypass CAPTCHA & anti-bot for free. Start local, scale to cloud.

Explore BrowserAct Skills →

Stop writing automation & scrapers

Install the CLI. Run your first Skill in 30 seconds. Scale when you're ready.

Start free
free · no credit card