SKILL.md

Distributed Tracing

Implement distributed tracing with Jaeger and Tempo for request flow visibility across microservices.

Purpose

Track requests across distributed systems to understand latency, dependencies, and failure points.

When to Use

Debug latency issues

Understand service dependencies

Identify bottlenecks

Trace error propagation

Analyze request paths

Distributed Tracing Concepts

Trace Structure

Trace (Request ID: abc123)

  ↓

Span (frontend) [100ms]

  ↓

Span (api-gateway) [80ms]

  ├→ Span (auth-service) [10ms]

  └→ Span (user-service) [60ms]

      └→ Span (database) [40ms]

Key Components

Trace - End-to-end request journey

Span - Single operation within a trace

Context - Metadata propagated between services

Tags - Key-value pairs for filtering

Logs - Timestamped events within a span

Jaeger Setup

Kubernetes Deployment

# Deploy Jaeger Operator

kubectl create namespace observability

kubectl create -f https://github.com/jaegertracing/jaeger-operator/releases/download/v1.51.0/jaeger-operator.yaml -n observability

# Deploy Jaeger instance

kubectl apply -f - <<EOF

apiVersion: jaegertracing.io/v1

kind: Jaeger

metadata:

  name: jaeger

  namespace: observability

spec:

  strategy: production

  storage:

    type: elasticsearch

    options:

      es:

        server-urls: http://elasticsearch:9200

  ingress:

    enabled: true

EOF

Docker Compose

version: "3.8"

services:

  jaeger:

    image: jaegertracing/all-in-one:1.62

    ports:

      - "5775:5775/udp"

      - "6831:6831/udp"

      - "6832:6832/udp"

      - "5778:5778"

      - "16686:16686" # UI

      - "14268:14268" # Collector

      - "14250:14250" # gRPC

      - "9411:9411" # Zipkin

    environment:

      - COLLECTOR_ZIPKIN_HOST_PORT=:9411

Reference: See references/jaeger-setup.md

Application Instrumentation

OpenTelemetry (Recommended)

#### Python (Flask)

from opentelemetry import trace

from opentelemetry.exporter.jaeger.thrift import JaegerExporter

from opentelemetry.sdk.resources import SERVICE_NAME, Resource

from opentelemetry.sdk.trace import TracerProvider

from opentelemetry.sdk.trace.export import BatchSpanProcessor

from opentelemetry.instrumentation.flask import FlaskInstrumentor

from flask import Flask

# Initialize tracer
# NOTE(review): the Jaeger Thrift exporter is reportedly deprecated upstream in
# favor of OTLP export — confirm against the OpenTelemetry version you install.
resource = Resource(attributes={SERVICE_NAME: "my-service"})
exporter = JaegerExporter(agent_host_name="jaeger", agent_port=6831)
processor = BatchSpanProcessor(exporter)

# Wire the batching processor into the provider, then install it globally.
provider = TracerProvider(resource=resource)
provider.add_span_processor(processor)
trace.set_tracer_provider(provider)

# Instrument Flask
app = Flask(__name__)
FlaskInstrumentor().instrument_app(app)

@app.route('/api/users')
def get_users():
    """Handle /api/users inside a "get_users" span and return the users payload."""
    tracer = trace.get_tracer(__name__)
    with tracer.start_as_current_span("get_users") as span:
        # Business logic
        users = fetch_users_from_db()
        # Record the real result size (not a hard-coded placeholder) so the
        # attribute is meaningful when filtering traces.
        span.set_attribute("user.count", len(users))
        return {"users": users}

def fetch_users_from_db():
    """Run the user query inside its own "database_query" span and return its result."""
    db_tracer = trace.get_tracer(__name__)
    with db_tracer.start_as_current_span("database_query") as db_span:
        # Describe the query on the span before executing it.
        db_span.set_attribute("db.system", "postgresql")
        db_span.set_attribute("db.statement", "SELECT * FROM users")
        # Database query
        rows = query_database()
        return rows

#### Node.js (Express)

// `trace` is needed to obtain a tracer in the route handler; `SpanStatusCode`
// lets the handler mark a span as failed.
const { trace, SpanStatusCode } = require("@opentelemetry/api");
const { NodeTracerProvider } = require("@opentelemetry/sdk-trace-node");
const { JaegerExporter } = require("@opentelemetry/exporter-jaeger");
const { BatchSpanProcessor } = require("@opentelemetry/sdk-trace-base");
const { registerInstrumentations } = require("@opentelemetry/instrumentation");
const { HttpInstrumentation } = require("@opentelemetry/instrumentation-http");
const {
  ExpressInstrumentation,
} = require("@opentelemetry/instrumentation-express");

// Initialize tracer
const provider = new NodeTracerProvider({
  resource: { attributes: { "service.name": "my-service" } },
});

const exporter = new JaegerExporter({
  endpoint: "http://jaeger:14268/api/traces",
});

provider.addSpanProcessor(new BatchSpanProcessor(exporter));
provider.register();

// Instrument libraries
registerInstrumentations({
  instrumentations: [new HttpInstrumentation(), new ExpressInstrumentation()],
});

// NOTE(review): express is loaded only after the instrumentations are
// registered — presumably so they can patch the module on load; keep this order.
const express = require("express");

const app = express();

/**
 * GET /api/users — fetches users inside a manual "get_users" span.
 * Failures are recorded on the span and forwarded to Express error handling;
 * the span is always ended.
 */
app.get("/api/users", async (req, res, next) => {
  const tracer = trace.getTracer("my-service");
  const span = tracer.startSpan("get_users");

  try {
    const users = await fetchUsers();
    span.setAttributes({ "user.count": users.length });
    res.json({ users });
  } catch (err) {
    // Attach the failure to the trace so it shows up in error searches.
    span.recordException(err);
    span.setStatus({ code: SpanStatusCode.ERROR, message: err.message });
    next(err);
  } finally {
    span.end();
  }
});

#### Go

package main

import (

    "context"

    "go.opentelemetry.io/otel"

    "go.opentelemetry.io/otel/exporters/jaeger"

    "go.opentelemetry.io/otel/sdk/resource"

    sdktrace "go.opentelemetry.io/otel/sdk/trace"

    semconv "go.opentelemetry.io/otel/semconv/v1.4.0"

)

func initTracer() (*sdktrace.TracerProvider, error) {

    exporter, err := jaeger.New(jaeger.WithCollectorEndpoint(

        jaeger.WithEndpoint("http://jaeger:14268/api/traces"),

    ))

    if err != nil {

        return nil, err

    }

    tp := sdktrace.NewTracerProvider(

        sdktrace.WithBatcher(exporter),

        sdktrace.WithResource(resource.NewWithAttributes(

            semconv.SchemaURL,

            semconv.ServiceNameKey.String("my-service"),

        )),

    )

    otel.SetTracerProvider(tp)

    return tp, nil

}

func getUsers(ctx context.Context) ([]User, error) {

    tracer := otel.Tracer("my-service")

    ctx, span := tracer.Start(ctx, "get_users")

    defer span.End()

    span.SetAttributes(attribute.String("user.filter", "active"))

    users, err := fetchUsersFromDB(ctx)

    if err != nil {

        span.RecordError(err)

        return nil, err

    }

    span.SetAttributes(attribute.Int("user.count", len(users)))

    return users, nil

}

Reference: See references/instrumentation.md

Context Propagation

HTTP Headers

traceparent: 00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01

tracestate: congo=t61rcWkgMzE

Propagation in HTTP Requests

#### Python

import requests

from opentelemetry.propagate import inject

# Start from an empty header dict and let the propagator fill in the
# trace-context headers for the currently active span.
headers = {}
inject(headers)  # Injects trace context

response = requests.get('http://downstream-service/api', headers=headers)

#### Node.js

const axios = require("axios");
// `context` supplies the active context that `propagation.inject` serializes.
const { context, propagation } = require("@opentelemetry/api");

const headers = {};
propagation.inject(context.active(), headers);

axios.get("http://downstream-service/api", { headers });

Tempo Setup (Grafana)

Kubernetes Deployment

apiVersion: v1

kind: ConfigMap

metadata:

  name: tempo-config

data:

  tempo.yaml: |

    server:

      http_listen_port: 3200

    distributor:

      receivers:

        jaeger:

          protocols:

            thrift_http:

            grpc:

        otlp:

          protocols:

            http:

            grpc:

    storage:

      trace:

        backend: s3

        s3:

          bucket: tempo-traces

          endpoint: s3.amazonaws.com

    querier:

      frontend_worker:

        frontend_address: tempo-query-frontend:9095

---

apiVersion: apps/v1
kind: Deployment
metadata:
  name: tempo
spec:
  replicas: 1
  # apps/v1 requires a selector, and it must match the pod template labels.
  selector:
    matchLabels:
      app: tempo
  template:
    metadata:
      labels:
        app: tempo
    spec:
      containers:
        - name: tempo
          image: grafana/tempo:2.7
          args:
            - -config.file=/etc/tempo/tempo.yaml
          volumeMounts:
            # Mounts the tempo-config ConfigMap so tempo.yaml is at /etc/tempo.
            - name: config
              mountPath: /etc/tempo
      volumes:
        - name: config
          configMap:
            name: tempo-config

Reference: See assets/jaeger-config.yaml.template

Sampling Strategies

Probabilistic Sampling

# Sample 1% of traces

sampler:

  type: probabilistic

  param: 0.01

Rate Limiting Sampling

# Sample max 100 traces per second

sampler:

  type: ratelimiting

  param: 100

Parent-Based Ratio Sampling (OpenTelemetry SDK)

from opentelemetry.sdk.trace.sampling import ParentBased, TraceIdRatioBased

# Sample based on trace ID (deterministic)

sampler = ParentBased(root=TraceIdRatioBased(0.01))

Trace Analysis

Finding Slow Requests

Jaeger Query:

service=my-service

duration > 1s

Finding Errors

Jaeger Query:

service=my-service

error=true

http.status_code=500  (tag filters are exact-match; Jaeger search has no range operators, so query each 5xx code separately)

Service Dependency Graph

Jaeger automatically generates service dependency graphs showing:

Service relationships

Request rates

Error rates

Average latencies

Best Practices

Sample appropriately (1-10% in production)

Add meaningful tags (user_id, request_id)

Propagate context across all service boundaries

Log exceptions in spans

Use consistent naming for operations

Monitor tracing overhead (<1% CPU impact)

Set up alerts for trace errors

Implement distributed context (baggage)

Use span events for important milestones

Document instrumentation standards

Integration with Logging

Correlated Logs

import logging

from opentelemetry import trace

logger = logging.getLogger(__name__)

def process_request():
    """Log a message that carries the current span's trace ID for log/trace correlation."""
    span_context = trace.get_current_span().get_span_context()
    # Render the integer trace ID as 32 zero-padded lowercase hex digits.
    hex_trace_id = format(span_context.trace_id, '032x')
    logger.info("Processing request", extra={"trace_id": hex_trace_id})

Troubleshooting

No traces appearing:

Check collector endpoint

Verify network connectivity

Check sampling configuration

Review application logs

High latency overhead:

Reduce sampling rate

Use batch span processor

Check exporter configuration

Related Skills

prometheus-configuration - For metrics

grafana-dashboards - For visualization

slo-implementation - For latency SLOs

distributed-tracing

SKILL.md

Distributed Tracing

Purpose

When to Use

Distributed Tracing Concepts

Trace Structure

Key Components

Jaeger Setup

Kubernetes Deployment

Docker Compose

Application Instrumentation

OpenTelemetry (Recommended)

Context Propagation

HTTP Headers

Propagation in HTTP Requests

Tempo Setup (Grafana)

Kubernetes Deployment

Sampling Strategies

Probabilistic Sampling

Rate Limiting Sampling

Adaptive Sampling

Trace Analysis

Finding Slow Requests

Finding Errors

Service Dependency Graph

Best Practices

Integration with Logging

Correlated Logs

Troubleshooting

Related Skills

Stop writing automation&scrapers

distributed-tracing

SKILL.md

Distributed Tracing

Purpose

When to Use

Distributed Tracing Concepts

Trace Structure

Key Components

Jaeger Setup

Kubernetes Deployment

Docker Compose

Application Instrumentation

OpenTelemetry (Recommended)

Context Propagation

HTTP Headers

Propagation in HTTP Requests

Tempo Setup (Grafana)

Kubernetes Deployment

Sampling Strategies

Probabilistic Sampling

Rate Limiting Sampling

Adaptive Sampling

Trace Analysis

Finding Slow Requests

Finding Errors

Service Dependency Graph

Best Practices

Integration with Logging

Correlated Logs

Troubleshooting

Related Skills

Let your agent run on any real-world website

Related skills

Stop writing automation&scrapers