SKILL.md

W&B Primary Skill

Name: wandb-primary
Author: wandb

Environment defaults

Python: run scripts with python, install packages with uv add

LLM: OpenAI gpt-5.4 (reasoning: high, endpoint: responses)

Fast recipes — use these first

These cover the most common tasks. Each is a single script. Copy, fill in placeholders, run.

Count runs (exact, fast)

import wandb, os

api = wandb.Api(timeout=60)

path = f"{os.environ['WANDB_ENTITY']}/{os.environ['WANDB_PROJECT']}"

total = len(api.runs(path, per_page=1, include_sweeps=False, lazy=True))

finished = len(api.runs(path, filters={"state": "finished"}, per_page=1, include_sweeps=False, lazy=True))

crashed = len(api.runs(path, filters={"state": "crashed"}, per_page=1, include_sweeps=False, lazy=True))

running = len(api.runs(path, filters={"state": "running"}, per_page=1, include_sweeps=False, lazy=True))

print(f"Total: {total}  |  Finished: {finished}  |  Crashed: {crashed}  |  Running: {running}")

Count traces (fast, server-side)

import weave, os, logging

logging.getLogger("weave").setLevel(logging.ERROR)

from weave.trace_server.trace_server_interface import CallsQueryStatsReq

entity = os.environ["WANDB_ENTITY"]

project = os.environ["WANDB_PROJECT"]

client = weave.init(f"{entity}/{project}")

pid = f"{entity}/{project}"

# Total root traces

stats = client.server.calls_query_stats(CallsQueryStatsReq(

    project_id=pid, filter={"trace_roots_only": True}

))

print(f"Root traces: {stats.count}")

# Count by op name

for op in ["Evaluation.evaluate", "my_op.turn"]:

    s = client.server.calls_query_stats(CallsQueryStatsReq(

        project_id=pid,

        filter={"op_names": [f"weave:///{entity}/{project}/op/{op}:*"]},

    ))

    print(f"  {op}: {s.count}")

Summarize project (runs + traces in one script)

import wandb, weave, os, logging

logging.getLogger("weave").setLevel(logging.ERROR)

from weave.trace_server.trace_server_interface import CallsQueryStatsReq

entity = os.environ["WANDB_ENTITY"]

project = os.environ["WANDB_PROJECT"]

path = f"{entity}/{project}"

# --- Runs ---

api = wandb.Api(timeout=60)

total_runs = len(api.runs(path, per_page=1, include_sweeps=False, lazy=True))

finished = len(api.runs(path, filters={"state": "finished"}, per_page=1, include_sweeps=False, lazy=True))

recent = api.runs(path, order="-created_at", per_page=5)[:5]

print(f"=== Runs ({total_runs} total, {finished} finished) ===")

for r in recent:

    print(f"  {r.name} [{r.state}] {r.created_at[:10]}")

# --- Weave Traces ---

client = weave.init(path)

pid = f"{entity}/{project}"

root_stats = client.server.calls_query_stats(CallsQueryStatsReq(

    project_id=pid, filter={"trace_roots_only": True}

))

print(f"\n=== Weave Traces ({root_stats.count} root traces) ===")

recent_calls = list(client.get_calls(

    sort_by=[{"field": "started_at", "direction": "desc"}],

    limit=5,

    columns=["op_name", "started_at", "display_name"],

))

for c in recent_calls:

    name = c.display_name or c.op_name.split("/")[-1].split(":")[0]

    started = c.started_at.strftime("%Y-%m-%d %H:%M") if c.started_at else "?"

    print(f"  {name} @ {started}")

Inspect a single run

import wandb, os

api = wandb.Api(timeout=60)

path = f"{os.environ['WANDB_ENTITY']}/{os.environ['WANDB_PROJECT']}"

run = api.run(f"{path}/RUN_ID")

print(f"Name: {run.name}")

print(f"State: {run.state}")

print(f"Created: {run.created_at}")

print(f"Tags: {run.tags}")

print(f"Last step: {run.lastHistoryStep}")

# Key metrics (replace with actual keys from probe or user request)

for k in ["loss", "val_loss", "accuracy"]:

    v = run.summary_metrics.get(k)

    if v is not None:

        print(f"  {k}: {v}")

Compare two runs

import wandb, os, sys

sys.path.insert(0, "skills/wandb-primary/scripts")

from wandb_helpers import get_api, compare_configs

api = get_api()

path = f"{os.environ['WANDB_ENTITY']}/{os.environ['WANDB_PROJECT']}"

run_a = api.run(f"{path}/RUN_A_ID")

run_b = api.run(f"{path}/RUN_B_ID")

# Config diff

diffs = compare_configs(run_a, run_b)

if diffs:

    print("Config differences:")

    for d in diffs:

        print(f"  {d['key']}: {d[run_a.name]} -> {d[run_b.name]}")

else:

    print("Configs are identical")

# Metric comparison

print("\nMetrics:")

for k in ["loss", "val_loss", "accuracy"]:

    a = run_a.summary_metrics.get(k, "N/A")

    b = run_b.summary_metrics.get(k, "N/A")

    print(f"  {k}: {a} vs {b}")

Summarize latest eval

import weave, os, sys, logging

logging.getLogger("weave").setLevel(logging.ERROR)

from weave.trace.weave_client import CallsFilter

sys.path.insert(0, "skills/wandb-primary/scripts")

from weave_helpers import unwrap, eval_results_to_dicts, results_summary

entity = os.environ["WANDB_ENTITY"]

project = os.environ["WANDB_PROJECT"]

client = weave.init(f"{entity}/{project}")

# Get latest eval

op_ref = f"weave:///{entity}/{project}/op/Evaluation.evaluate:*"

evals = list(client.get_calls(

    filter=CallsFilter(op_names=[op_ref]),

    sort_by=[{"field": "started_at", "direction": "desc"}],

    limit=1,

))

if not evals:

    print("No evaluations found")

else:

    ec = evals[0]

    print(f"Eval: {ec.display_name or 'unnamed'} @ {ec.started_at}")

    # Get predict_and_score children

    pas_ref = f"weave:///{entity}/{project}/op/Evaluation.predict_and_score:*"

    pas = list(client.get_calls(

        filter=CallsFilter(op_names=[pas_ref], parent_ids=[ec.id])

    ))

    results = eval_results_to_dicts(pas, agent_name=ec.display_name or "agent")

    print(results_summary(results))

Inspect recent traces

import weave, os, logging

logging.getLogger("weave").setLevel(logging.ERROR)

sys.path.insert(0, "skills/wandb-primary/scripts")

from weave_helpers import unwrap, get_token_usage

entity = os.environ["WANDB_ENTITY"]

project = os.environ["WANDB_PROJECT"]

client = weave.init(f"{entity}/{project}")

calls = list(client.get_calls(

    sort_by=[{"field": "started_at", "direction": "desc"}],

    limit=10,

))

for c in calls:

    name = c.display_name or c.op_name.split("/")[-1].split(":")[0]

    started = c.started_at.strftime("%Y-%m-%d %H:%M") if c.started_at else "?"

    duration = ""

    if c.started_at and c.ended_at:

        duration = f" ({(c.ended_at - c.started_at).total_seconds():.1f}s)"

    status = c.summary.get("weave", {}).get("status", "?") if c.summary else "?"

    tokens = get_token_usage(c)

    tok_str = f" [{tokens['total_tokens']} tok]" if tokens['total_tokens'] else ""

    print(f"  {name} [{status}] {started}{duration}{tok_str}")

Create a W&B Report

import wandb, os

from wandb.apis import reports as wr

entity = os.environ["WANDB_ENTITY"]

project = os.environ["WANDB_PROJECT"]

runset = wr.Runset(entity=entity, project=project, name="All runs")

plots = wr.PanelGrid(

    runsets=[runset],

    panels=[

        wr.LinePlot(title="Loss", x="_step", y=["LOSS_KEY"]),

        wr.BarPlot(title="Accuracy", metrics=["ACC_KEY"], orientation="v"),

    ],

)

report = wr.Report(

    entity=entity,

    project=project,

    title="Project Analysis",

    description="Auto-generated summary",

    width="fixed",

    blocks=[

        wr.H1(text="Project Analysis"),

        wr.P(text="Auto-generated summary from W&#x26;B API."),

        plots,

    ],

)

report.save(draft=True)

print(f"Report saved: {report.url}")

Set up a Weave Monitor

import weave, os

entity = os.environ["WANDB_ENTITY"]

project = os.environ["WANDB_PROJECT"]

client = weave.init(f"{entity}/{project}")

# Define a scorer

@weave.op()

def my_scorer(output: dict) -> dict:

    """Score based on output quality."""

    # Replace with actual scoring logic

    passed = output.get("succeeded", False)

    return {"passed": passed, "score": 1.0 if passed else 0.0}

# Create monitor

monitor = weave.Monitor(

    entity=entity,

    project=project,

    name="quality-monitor",

    scorers=[my_scorer],

    # Filter which ops to monitor:

    # op_names=["my_agent.run"],

)

print(f"Monitor created: {monitor.name}")

Relaunch a run (1 command, auto-selects queue)

python skills/wandb-primary/scripts/launch_helpers.py relaunch \

  "https://wandb.ai/entity/project/runs/run_id" \

  --config '{"epochs": 100}'

Relaunch a run (Python)

import sys

sys.path.insert(0, "skills/wandb-primary/scripts")

from launch_helpers import parse_run_url, list_queues, relaunch_run

entity, project, run_id = parse_run_url("RUN_URL")

run_path = f"{entity}/{project}/{run_id}"

queues = list_queues(entity)

queue = queues[0]  # use the recommended queue

relaunch_run(

    run_path=run_path,

    queue_name=queue["name"],

    namespace=queue["namespace"],

    config={"lr": 0.001, "epochs": 20},

)

Modify code and launch

import sys

sys.path.insert(0, "skills/wandb-primary/scripts")

from launch_helpers import get_job_artifact, download_code_artifact, list_queues, create_and_launch_modified_job

# Step 1: Download code

info = download_code_artifact("entity/project/job-name:latest")

# Edit files in info["code_dir"]...

# Step 2: Launch modified code

queues = list_queues("ENTITY")

queue = queues[0]

create_and_launch_modified_job(

    code_dir=info["code_dir"],

    entrypoint=info["entrypoint"],

    entity=info["entity"], project=info["project"],

    queue_name=queue["name"], namespace=queue["namespace"],

    job_name="my-modified-job",

    base_image=info["base_image"],

)

Check status of a launched run

python skills/wandb-primary/scripts/launch_helpers.py check \

  "entity" "project" "queue-name" "QUEUE_ITEM_ID"

Launch rules

Minimize turns. For a simple relaunch with config changes, use ONE command: python skills/wandb-primary/scripts/launch_helpers.py relaunch <URL> --config '{"epochs": 100}'

The CLI auto-handles everything. relaunch auto-discovers queues, selects the best one, finds the job artifact, and submits.

**Do NOT read launch_helpers.py.** This SKILL.md documents everything you need.

Do NOT check WANDB_ENTITY/PROJECT env vars for launch. The run URL contains the entity and project.

**NEVER fake a launch with wandb.init().** Use relaunch_run() or the CLI.

NEVER run training locally in the sandbox. No GPU. Always use Launch.

Config change vs code change — decide FIRST.

Change type

Examples

How to launch

Config override

epochs, lr, batch_size, any value in wandb.config

relaunch_run(..., config={"epochs": 100})

Code change

model architecture, loss function, data augmentation

Download code -> edit -> create_and_launch_modified_job()

Important: If the user asks to change something that isn't a config field (e.g. "add more conv layers", "change the optimizer"), you MUST modify the code. Passing unknown config keys does nothing — the training script doesn't read them.

Launch decision tree

I need to...

Do this

Change hyperparameters only

relaunch_run(run_path, queue_name, namespace, config={"epochs": 100})

Change code (architecture, logic)

download_code_artifact() -> edit files -> create_and_launch_modified_job()

Launch from an artifact path

launch_job_artifact(artifact_path, queue_name)

Submit new code (no existing run)

submit_code_artifact_job(code_files, entrypoint, ...)

Check on a launched run

check_launched_run(entity, project, queue_name, item_id)

Find a queue

list_queues(entity) — use the recommended one

Create a new queue

create_queue(name, entity, gpus=1, cpu=8, memory="80Gi")

Step-by-step launch (when CLI one-liner isn't enough)

import sys

sys.path.insert(0, "skills/wandb-primary/scripts")

from launch_helpers import parse_run_url, list_queues, get_job_artifact, relaunch_run, submit_code_artifact_job

# 1. Parse the run URL

entity, project, run_id = parse_run_url("https://wandb.ai/entity/project/runs/run_id")

run_path = f"{entity}/{project}/{run_id}"

# 2. Find a queue (also gives namespace)

queues = list_queues(entity)

queue = queues[0]

# 3. Check for a job artifact

job_artifact = get_job_artifact(run_path)

# 4a. If job artifact exists -> relaunch with config overrides

if job_artifact:

    relaunch_run(run_path, queue["name"], queue["namespace"],

                 config={"lr": 0.001, "epochs": 20})

# 4b. If no job artifact -> submit code directly

else:

    submit_code_artifact_job(

        code_files=["train.py"], entrypoint="python train.py",

        entity=entity, project=project,

        queue_name=queue["name"], job_name="my-train-job",

        base_image="pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime",

        requirements=["wandb"],

    )

Code change workflow — two scripts, not one

Script 1: Download and read the code (so you can see what to edit):

import sys

sys.path.insert(0, "skills/wandb-primary/scripts")

from launch_helpers import parse_run_url, get_job_artifact, download_code_artifact

import os

entity, project, run_id = parse_run_url("RUN_URL")

art = get_job_artifact(f"{entity}/{project}/{run_id}")

info = download_code_artifact(f"{entity}/{project}/{art.name}")

# Prints: code_dir, files, entrypoint, base_image

for f in info["files"]:

    print(f"\n=== {f} ===")

    with open(os.path.join(info["code_dir"], f)) as fh:

        print(fh.read())

Then edit the files in info["code_dir"] using write_file or apply_patch.

Script 2: Launch the modified code:

import sys

sys.path.insert(0, "skills/wandb-primary/scripts")

from launch_helpers import list_queues, create_and_launch_modified_job

queues = list_queues("ENTITY")

queue = queues[0]

create_and_launch_modified_job(

    code_dir="CODE_DIR_FROM_STEP_1",

    entrypoint="python model.py",

    entity="ENTITY", project="PROJECT",

    queue_name=queue["name"], namespace=queue["namespace"],

    job_name="JOB_NAME", base_image="BASE_IMAGE",

)

Launch infrastructure gotchas

**Always pass resource_args explicitly** — queue defaults get double-nested by the server

Restart agent after queue delete/recreate — agent loses its registration

**requirements.txt is read from code dir** — _create_job does NOT inspect the venv

**Keep requirements.txt minimal** — only deps not in base image

**Build base images for linux/amd64** — not the default on Mac

**Inject K8s secrets via k8s_secrets param** — not via queue defaults

CRITICAL: Large project performance rules

These rules prevent 502 errors, timeouts, and multi-minute hangs on projects with 10K+ runs or runs with 1K+ metrics. Violating any of these will cause failures on large projects.

**Always use wandb.Api(timeout=60)** — the default 19s timeout causes constant failures

**NEVER call history() or scan_history() without explicit keys=[...]** — runs with 1K+ metrics will 502 or timeout when fetching all columns

**Use per_page=min(limit, 1000)** when calling api.runs() for list tasks, and use per_page=1 for exact count tasks

Prefer server-side filters (summary_metrics.X: {$gt: Y}) over client-side iteration

**For exact counts, prefer len(api.runs(..., per_page=1, include_sweeps=False, lazy=True))** — never len(list(runs))

**Use beta_scan_history** for runs with 10K+ history steps — reads from parquet, not GraphQL

Never iterate all config keys unless explicitly needed — access specific keys by name

**Default to include_sweeps=False for read-only retrieval tasks**

**Use calls_query_stats for trace counts** — never materialize all calls just to count them

When to use what

I need to...

Use

Query training runs, loss curves, hyperparameters

W&B SDK (wandb.Api()) — see references/WANDB_SDK.md

Query GenAI traces, calls, evaluations

Weave SDK (weave.init(), client.get_calls()) — see references/WEAVE_SDK.md

Convert Weave wrapper types to plain Python

**weave_helpers.unwrap()**

Build a DataFrame from training runs

**wandb_helpers.fetch_runs() (fast) or wandb_helpers.runs_to_dataframe()**

Extract eval results for analysis

**weave_helpers.eval_results_to_dicts()**

Count traces without fetching them

**calls_query_stats** from Weave server API

Need low-level Weave filtering (CallsFilter, Query)

Raw Weave SDK — see references/WEAVE_SDK.md

Create a report

**wandb.apis.reports**

Set up production monitoring

**weave.Monitor**

Reproduce/relaunch a run

**launch_helpers.relaunch_run()** or CLI

Launch a training job on GPU/K8s

**launch_helpers.submit_code_artifact_job()**

Modify code and launch

**launch_helpers.download_code_artifact() -> edit -> create_and_launch_modified_job()**

List or create launch queues

**launch_helpers.list_queues() / create_queue()**

Bundled files

Helper libraries

import sys

sys.path.insert(0, "skills/wandb-primary/scripts")

# Weave helpers (traces, evals, GenAI)

from weave_helpers import (

    unwrap,                  # Recursively convert Weave types -> plain Python

    get_token_usage,         # Extract token counts from a call's summary

    eval_results_to_dicts,   # predict_and_score calls -> list of result dicts

    pivot_solve_rate,        # Build task-level pivot table across agents

    results_summary,         # Print compact eval summary

    eval_health,             # Extract status/counts from Evaluation.evaluate calls

    eval_efficiency,         # Compute tokens-per-success across eval calls

)

# W&#x26;B helpers (training runs, metrics) — large-project optimized

from wandb_helpers import (

    get_api,             # Create API with safe timeout (default 60s)

    probe_project,       # Discover project scale, metrics, config BEFORE querying

    fetch_runs,          # FAST: Direct GraphQL with selective metrics (17x faster)

    runs_to_dataframe,   # Legacy: iterate run objects (slower, use fetch_runs instead)

    diagnose_run,        # Quick diagnostic summary (configurable metric keys)

    compare_configs,     # Side-by-side config diff between two runs

    scan_history,        # Smart history scan (auto-selects beta_scan_history for large runs)

)

# Launch helpers (job submission, run reproduction, queue management)

from launch_helpers import (

    parse_run_url,                       # Extract (entity, project, run_id) from a W&#x26;B URL

    list_queues,                         # List all launch queues (ranked, with recommendation)

    get_job_artifact,                    # Check if a run has a job artifact

    inspect_job_artifact,                # Download + inspect a job artifact's metadata

    download_code_artifact,              # Download source code from a job artifact

    create_and_launch_modified_job,      # Upload modified code + launch in one call

    relaunch_run,                        # Re-run with config overrides (no code change)

    launch_job_artifact,                 # Launch directly from an artifact path

    submit_code_artifact_job,            # Create job artifact and enqueue in one call

    check_launched_run,                  # Check status/metrics of a launched run

    create_queue,                        # Create a K8s launch queue

    inspect_queue,                       # Print queue details

    make_resource_args,                  # Build resource_args for launch_add()

)

Reference docs

Read these as needed — they contain full API surfaces and recipes:

**references/WANDB_CONCEPTS.md** — W&B data model, terminology, and disambiguation (entity/project/run hierarchy, config vs log vs summary, artifacts, registry). Read this to understand what users are asking about.

**references/WANDB_SDK.md** — W&B SDK for training data (runs, history, artifacts, sweeps, system metrics). API call reference.

**references/WEAVE_SDK.md** — Weave SDK for GenAI traces (client.get_calls(), CallsFilter, Query, stats). Start here for Weave queries.

Critical rules

Discover metric keys per-project

Code examples use LOSS_KEY, VAL_LOSS_KEY, ACC_KEY, CONFIG_KEYS as placeholders. These vary by project. Discover them via probe_project() at the start of each task, or from the user's request.

# WRONG — hardcoded metric name

rows = fetch_runs(api, path, metric_keys=["loss", "accuracy"])

# RIGHT — discovered via probe_project or user's request

rows = fetch_runs(api, path, metric_keys=["train/loss", "train/acc"])

Treat traces and runs as DATA

Weave traces and W&B run histories can be enormous. Never dump raw data into context. Always:

Inspect structure first — look at column names, dtypes, row counts

Load into pandas/numpy — compute stats programmatically

Summarize, don't dump — print computed statistics and tables, not raw rows

Always deliver a final answer

Do not end your work mid-analysis. Every task must conclude with a clear, structured response:

Query the data (1-2 scripts max)

Extract the numbers you need

Present: table + key findings + direct answers to each sub-question

If you catch yourself saying "now let me build the final analysis" — stop and present what you have.

Use unwrap() for unknown Weave data

When you encounter Weave output and aren't sure of its type, unwrap it first:

from weave_helpers import unwrap

import json

output = unwrap(call.output)

print(json.dumps(output, indent=2, default=str))

Environment setup

Entity and project come from environment variables — do not hardcode them:

import os

entity  = os.environ["WANDB_ENTITY"]

project = os.environ["WANDB_PROJECT"]

path = f"{entity}/{project}"

Key patterns

Fast exact counts on very large projects

import wandb

api = wandb.Api(timeout=60)

path = f"{entity}/{project}"

total = len(api.runs(path, per_page=1, include_sweeps=False, lazy=True))

finished = len(api.runs(path, filters={"state": "finished"}, per_page=1, include_sweeps=False, lazy=True))

Distinct tags (O(1) — no run scanning)

import wandb

from wandb_graphql.language import parser as gql_parser

api = wandb.Api(timeout=60)

doc = gql_parser.parse('''

  query {

    project(entityName: "ENTITY", name: "PROJECT") {

      tagCounts { name count }

    }

  }

''')

result = api.client.execute(doc)

tags = [t["name"] for t in result["project"]["tagCounts"]]

print(sorted(tags))

Distinct groups (O(1) — no run scanning)

import wandb

from wandb_graphql.language import parser as gql_parser

api = wandb.Api(timeout=60)

doc = gql_parser.parse('''

  query {

    project(entityName: "ENTITY", name: "PROJECT") {

      groupedRuns(groupKeys: ["group"], first: 100) {

        ... on GroupedRunConnection {

          edges {

            node { group totalRuns }

          }

        }

      }

    }

  }

''')

result = api.client.execute(doc)

edges = result["project"]["groupedRuns"]["edges"]

groups = [e["node"]["group"] for e in edges if e["node"]["group"]]

print(sorted(groups))

W&B SDK — fast run fetching (17x faster on large projects)

import pandas as pd

from wandb_helpers import get_api, fetch_runs

api = get_api()

path = f"{entity}/{project}"

rows = fetch_runs(

    api, path,

    metric_keys=["LOSS_KEY", "ACC_KEY"],

    filters={"state": "finished"},

    limit=100,

)

df = pd.DataFrame(rows)

print(df.describe())

Weave — eval call hierarchy

Evaluation.evaluate (root)

  +-- Evaluation.predict_and_score (one per dataset row x trials)

  |     +-- model.predict (the actual model call)

  |     +-- scorer_1.score

  |     +-- scorer_2.score

  +-- Evaluation.summarize

Token usage

from weave_helpers import get_token_usage

usage = get_token_usage(call)

print(f"Tokens: {usage['total_tokens']} (in={usage['input_tokens']}, out={usage['output_tokens']})")

Report authoring (W&B Reports)

from wandb.apis import reports as wr

runset = wr.Runset(entity=entity, project=project, name="All runs")

plots = wr.PanelGrid(

    runsets=[runset],

    panels=[

        wr.LinePlot(title="Loss", x="_step", y=["LOSS_KEY"]),

        wr.BarPlot(title="Accuracy", metrics=["ACC_KEY"], orientation="v"),

    ],

)

report = wr.Report(

    entity=entity, project=project,

    title="Project analysis",

    description="Summary of recent runs",

    width="fixed",

    blocks=[

        wr.H1(text="Project analysis"),

        wr.P(text="Auto-generated summary from W&#x26;B API."),

        plots,

    ],

)

report.save(draft=True)

Gotchas

Weave API

Gotcha

Wrong

Right

weave.init args

weave.init(project="x")

weave.init("x") (positional)

Parent filter

filter={'parent_id': 'x'}

filter={'parent_ids': ['x']} (plural, list)

WeaveObject access

rubric.get('passed')

getattr(rubric, 'passed', None)

Nested output

out.get('succeeded')

out.get('output').get('succeeded') (output.output)

ObjectRef comparison

name_ref == "foo"

str(name_ref) == "foo"

CallsFilter import

from weave import CallsFilter

from weave.trace.weave_client import CallsFilter

Query import

from weave import Query

from weave.trace_server.interface.query import Query

Eval status path

summary["status"]

summary["weave"]["status"]

Eval success count

summary["success_count"]

summary["weave"]["status_counts"]["success"]

When in doubt

Guess the type

unwrap() first, then inspect

W&B API

Gotcha

Wrong

Right

API timeout

wandb.Api() (19s default)

wandb.Api(timeout=60) or get_api()

Summary access

run.summary["loss"]

run.summary_metrics.get("LOSS_KEY")

Loading all runs

list(api.runs(...))

runs[:200] (always slice)

Counting runs

len(list(api.runs(...)))

len(api.runs(..., per_page=1, include_sweeps=False, lazy=True))

Distinct tags

iterate all runs collecting run.tags

GraphQL tagCounts query

Distinct groups

iterate all runs collecting run.group

GraphQL groupedRuns query

run.config after lazy fetch

run.config returns {}

Use lazy=False when you need config

Pagination

api.runs(path) (per_page=50 default)

api.runs(path, per_page=min(N, 1000))

History — no keys on large run

run.history(samples=10) -> 502

run.history(samples=10, keys=["LOSS_KEY"])

scan_history — no keys

scan_history() -> timeout

scan_history(keys=["LOSS_KEY"])

Large history (10K+ steps)

scan_history(keys=[...])

beta_scan_history(keys=[...]) (parquet)

Cross-run search

iterate all runs client-side

Server-side filter: {"summary_metrics.X": {"$gt": Y}}

Launch

Gotcha

Wrong

Right

List queues

api.run_queues() or raw GQL

list_queues(entity) from helpers

resource_args

Rely on queue defaults

Pass via make_resource_args()

requirements.txt

pip freeze from venv

Write manually — only deps missing from base image

Base image arch

docker build on Mac

docker buildx build --platform linux/amd64

Fake launch

wandb.init() with config

relaunch_run() or launch_job_artifact()

Unknown config key

relaunch_run(config={"conv_layers": 4})

Code change — download, edit, create_and_launch_modified_job()

Weave logging noise

import logging

logging.getLogger("weave").setLevel(logging.ERROR)

wandb-primary

SKILL.md

W&#x26;B Primary Skill

Environment defaults

Fast recipes — use these first

Count runs (exact, fast)

Count traces (fast, server-side)

Summarize project (runs + traces in one script)

Inspect a single run

Compare two runs

Summarize latest eval

Inspect recent traces

Create a W&#x26;B Report

Set up a Weave Monitor

Relaunch a run (1 command, auto-selects queue)

Relaunch a run (Python)

Modify code and launch

Check status of a launched run

Launch rules

Launch decision tree

Step-by-step launch (when CLI one-liner isn't enough)

Code change workflow — two scripts, not one

Launch infrastructure gotchas

CRITICAL: Large project performance rules

When to use what

Bundled files

Helper libraries

Reference docs

Critical rules

Discover metric keys per-project

Treat traces and runs as DATA

Always deliver a final answer

Use unwrap() for unknown Weave data

Environment setup

Key patterns

Fast exact counts on very large projects

Distinct tags (O(1) — no run scanning)

Distinct groups (O(1) — no run scanning)

W&#x26;B SDK — fast run fetching (17x faster on large projects)

Weave — eval call hierarchy

Token usage

Report authoring (W&#x26;B Reports)

Gotchas

Weave API

W&#x26;B API

Launch

Weave logging noise

Let your agent run on any real-world website

Related skills

Stop writing automation&scrapers

W&B Primary Skill

Create a W&B Report

W&B SDK — fast run fetching (17x faster on large projects)

Report authoring (W&B Reports)

W&B API