pytorch-patterns

PyTorch deep learning patterns and best practices for building robust, efficient, and reproducible training pipelines, model architectures, and data loading.

INSTALLATION
npx skills add https://github.com/affaan-m/everything-claude-code --skill pytorch-patterns
Run in your project or agent environment. Adjust flags if your CLI version differs.

SKILL.md

PyTorch Development Patterns

Idiomatic PyTorch patterns and best practices for building robust, efficient, and reproducible deep learning applications.

When to Activate

  • Writing new PyTorch models or training scripts
  • Reviewing deep learning code
  • Debugging training loops or data pipelines
  • Optimizing GPU memory usage or training speed
  • Setting up reproducible experiments

Core Principles

1. Device-Agnostic Code

Always write code that works on both CPU and GPU without hardcoding devices.

# Good: Device-agnostic

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = MyModel().to(device)

data = data.to(device)

# Bad: Hardcoded device

model = MyModel().cuda()  # Crashes if no GPU

data = data.cuda()

2. Reproducibility First

Set all random seeds for reproducible results.

# Good: Full reproducibility setup

def set_seed(seed: int = 42) -> None:
    """Seed every random number generator used by a training run.

    Seeds Python's ``random`` module, NumPy, and PyTorch (CPU generator and
    all CUDA devices), then configures cuDNN for deterministic behaviour.

    Args:
        seed: Value applied to every generator.
    """
    # Python-level and NumPy generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: CPU first, then every CUDA device.
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # Turn off the cuDNN autotuner and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Bad: No seed control

model = MyModel()  # Different weights every run

3. Explicit Shape Management

Always document and verify tensor shapes.

# Good: Shape-annotated forward pass

def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Run conv -> pool -> flatten -> linear, tracking the shape at each step.

    The channel count (32) and the halving of H and W in the comments below
    follow the layer configuration this example assumes for ``self.conv1``
    and ``self.pool``.

    Args:
        x: Input batch of shape (batch_size, channels, height, width).

    Returns:
        Output of ``self.fc``, shape (batch_size, num_classes).
    """
    # x: (batch_size, channels, H, W)
    x = self.conv1(x)         # -> (batch_size, 32, H, W)
    x = self.pool(x)          # -> (batch_size, 32, H//2, W//2)
    # flatten() copies when the tensor is not contiguous (e.g. channels_last),
    # whereas view() would raise in that case.
    x = torch.flatten(x, 1)   # -> (batch_size, 32 * (H//2) * (W//2))
    return self.fc(x)         # -> (batch_size, num_classes)

# Bad: No shape tracking

def forward(self, x):

    x = self.conv1(x)

    x = self.pool(x)

    x = x.view(x.size(0), -1)  # What size is this?

    return self.fc(x)           # Will this even work?

Model Architecture Patterns

Clean nn.Module Structure

# Good: Well-organized module

class ImageClassifier(nn.Module):
    """Small CNN classifier: one convolutional block plus a linear head.

    The head expects ``64 * 16 * 16`` features, so inputs must be 3-channel
    32x32 images (the single 2x2 max-pool halves each spatial dimension).

    Args:
        num_classes: Number of output logits.
        dropout: Dropout probability applied before the linear head.
    """

    def __init__(self, num_classes: int, dropout: float = 0.5) -> None:
        super().__init__()
        # Feature extractor: (B, 3, 32, 32) -> (B, 64, 16, 16)
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
        )
        # Classification head: (B, 64 * 16 * 16) -> (B, num_classes)
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(64 * 16 * 16, num_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map a batch of images (B, 3, 32, 32) to logits (B, num_classes)."""
        x = self.features(x)
        # flatten() also handles non-contiguous activations (e.g. channels_last
        # inputs), where view() would raise.
        x = torch.flatten(x, 1)
        return self.classifier(x)

# Bad: Everything in forward

class ImageClassifier(nn.Module):

    def __init__(self):

        super().__init__()

    def forward(self, x):

        x = F.conv2d(x, weight=self.make_weight())  # Creates weight each call!

        return x

Proper Weight Initialization

# Good: Explicit initialization

def _init_weights(self, module: nn.Module) -> None:
    """Initialise one submodule in place; meant to be passed to ``Module.apply``.

    * ``nn.Linear`` / ``nn.Conv2d``: Kaiming-normal weights (``fan_out``,
      ReLU gain) and zero bias when a bias exists.
    * ``nn.BatchNorm2d``: weight set to one and bias to zero when the layer
      is affine.
    * Any other module type is left untouched.

    Args:
        module: The submodule to initialise.
    """
    if isinstance(module, (nn.Linear, nn.Conv2d)):
        nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
        # Layers built with bias=False have no bias tensor.
        if module.bias is not None:
            nn.init.zeros_(module.bias)
    elif isinstance(module, nn.BatchNorm2d):
        # With affine=False both weight and bias are None.
        if module.weight is not None:
            nn.init.ones_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)

model = MyModel()

model.apply(model._init_weights)

Training Loop Patterns

Standard Training Loop

# Good: Complete training loop with best practices

def train_one_epoch(

    model: nn.Module,

    dataloader: DataLoader,

    optimizer: torch.optim.Optimizer,

    criterion: nn.Module,

    device: torch.device,

    scaler: torch.amp.GradScaler | None = None,

) -> float:

    model.train()  # Always set train mode

    total_loss = 0.0

    for batch_idx, (data, target) in enumerate(dataloader):

        data, target = data.to(device), target.to(device)

        optimizer.zero_grad(set_to_none=True)  # More efficient than zero_grad()

        # Mixed precision training

        with torch.amp.autocast("cuda", enabled=scaler is not None):

            output = model(data)

            loss = criterion(output, target)

        if scaler is not None:

            scaler.scale(loss).backward()

            scaler.unscale_(optimizer)

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            scaler.step(optimizer)

            scaler.update()

        else:

            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

Validation Loop

# Good: Proper evaluation

@torch.no_grad()  # Decorator form: gradient tracking is off for the whole call
def evaluate(
    model: nn.Module,
    dataloader: DataLoader,
    criterion: nn.Module,
    device: torch.device,
) -> tuple[float, float]:
    """Evaluate ``model`` over ``dataloader`` without tracking gradients.

    Args:
        model: Network to evaluate; switched to eval mode.
        dataloader: Iterable of ``(data, target)`` batches that supports ``len()``.
        criterion: Loss callable invoked as ``criterion(output, target)``.
        device: Device the batches are moved to.

    Returns:
        ``(mean_loss, accuracy)``: the per-batch losses averaged over the
        number of batches, and the fraction of samples whose arg-max
        prediction along dim 1 equals the target.
    """
    # Eval mode disables dropout and makes BatchNorm use its running statistics.
    model.eval()

    loss_sum, n_correct, n_seen = 0.0, 0, 0
    for inputs, labels in dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        logits = model(inputs)
        loss_sum += criterion(logits, labels).item()

        hits = logits.argmax(1) == labels
        n_correct += hits.sum().item()
        n_seen += labels.size(0)

    mean_loss = loss_sum / len(dataloader)
    accuracy = n_correct / n_seen
    return mean_loss, accuracy

Data Pipeline Patterns

Custom Dataset

# Good: Clean Dataset with type hints

class ImageDataset(Dataset):
    """Dataset of ``*.jpg`` images in one directory, labelled by file stem.

    Args:
        image_dir: Directory searched (non-recursively) for ``*.jpg`` files.
        labels: Mapping from file stem (name without extension) to class index.
        transform: Optional callable applied to each RGB image.
    """

    def __init__(
        self,
        image_dir: str,
        labels: dict[str, int],
        transform: transforms.Compose | None = None,
    ) -> None:
        # glob() yields files in filesystem-dependent order; sorting keeps the
        # index -> sample mapping stable across runs and machines.
        self.image_paths = sorted(Path(image_dir).glob("*.jpg"))
        self.labels = labels
        self.transform = transform

    def __len__(self) -> int:
        """Return the number of images found in ``image_dir``."""
        return len(self.image_paths)

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, int]:
        """Load the image at ``idx`` and return ``(image, label)``.

        The image is converted to RGB and passed through ``transform`` when
        one was supplied; without a transform the RGB image object is
        returned as-is.

        Raises:
            KeyError: If the file's stem has no entry in ``labels``.
        """
        path = self.image_paths[idx]
        # Close the file handle promptly; convert() returns an independent,
        # fully loaded image.
        with Image.open(path) as raw:
            img = raw.convert("RGB")
        label = self.labels[path.stem]
        if self.transform is not None:
            img = self.transform(img)
        return img, label

Efficient DataLoader Configuration

# Good: Optimized DataLoader

dataloader = DataLoader(

    dataset,

    batch_size=32,

    shuffle=True,            # Shuffle for training

    num_workers=4,           # Parallel data loading

    pin_memory=True,         # Faster CPU->GPU transfer

    persistent_workers=True, # Keep workers alive between epochs

    drop_last=True,          # Consistent batch sizes for BatchNorm

)

# Bad: Slow defaults

dataloader = DataLoader(dataset, batch_size=32)  # num_workers=0, no pin_memory

Custom Collate for Variable-Length Data

# Good: Pad sequences in collate_fn

def collate_fn(batch: list[tuple[torch.Tensor, int]]) -> tuple[torch.Tensor, torch.Tensor]:
    """Collate variable-length sequences into one zero-padded batch.

    Args:
        batch: ``(sequence, label)`` pairs as produced by the dataset.

    Returns:
        ``(padded, labels)``: the sequences stacked batch-first and padded
        with 0 up to the longest sequence in the batch, and the labels as a
        tensor.
    """
    # Transpose the list of pairs into a list of sequences and a list of labels.
    sequences, labels = map(list, zip(*batch))
    padded = nn.utils.rnn.pad_sequence(
        sequences,
        batch_first=True,
        padding_value=0,
    )
    label_tensor = torch.tensor(labels)
    return padded, label_tensor

dataloader = DataLoader(dataset, batch_size=32, collate_fn=collate_fn)

Checkpointing Patterns

Save and Load Checkpoints

# Good: Complete checkpoint with all training state

def save_checkpoint(
    model: nn.Module,
    optimizer: torch.optim.Optimizer,
    epoch: int,
    loss: float,
    path: str,
) -> None:
    """Write a resumable training checkpoint to ``path``.

    The checkpoint is a dict with the keys ``"epoch"``, ``"model_state_dict"``,
    ``"optimizer_state_dict"`` and ``"loss"``.

    The data is first written to ``<path>.tmp`` and then moved over ``path``,
    so an interrupted save never leaves a truncated file where the previous
    checkpoint used to be.

    Args:
        model: Model whose ``state_dict()`` is stored.
        optimizer: Optimizer whose ``state_dict()`` is stored.
        epoch: Epoch number recorded in the checkpoint.
        loss: Loss value recorded in the checkpoint.
        path: Destination file path.
    """
    import os  # local import: keeps this snippet self-contained

    state = {
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "loss": loss,
    }
    tmp_path = f"{os.fspath(path)}.tmp"
    torch.save(state, tmp_path)
    # os.replace overwrites the destination in a single rename step.
    os.replace(tmp_path, path)

def load_checkpoint(

    path: str,

    model: nn.Module,

    optimizer: torch.optim.Optimizer | None = None,

) -> dict:

    checkpoint = torch.load(path, map_location="cpu", weights_only=True)

    model.load_state_dict(checkpoint["model_state_dict"])

    if optimizer:

        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

    return checkpoint

# Bad: Only saving model weights (can't resume training)

torch.save(model.state_dict(), "model.pt")

Performance Optimization

Mixed Precision Training

# Good: AMP with GradScaler

scaler = torch.amp.GradScaler("cuda")

for data, target in dataloader:

    with torch.amp.autocast("cuda"):

        output = model(data)

        loss = criterion(output, target)

    scaler.scale(loss).backward()

    scaler.step(optimizer)

    scaler.update()

    optimizer.zero_grad(set_to_none=True)

Gradient Checkpointing for Large Models

# Good: Trade compute for memory

from torch.utils.checkpoint import checkpoint

class LargeModel(nn.Module):
    """Example module whose forward pass wraps two blocks in activation checkpointing.

    NOTE(review): ``block1``, ``block2`` and ``head`` are not defined in this
    fragment; presumably they are submodules created in an ``__init__`` that
    is not shown.
    """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Pass ``x`` through ``block1`` and ``block2`` via ``checkpoint``, then ``head``."""

        # Recompute activations during backward to save memory

        x = checkpoint(self.block1, x, use_reentrant=False)

        x = checkpoint(self.block2, x, use_reentrant=False)

        return self.head(x)

torch.compile for Speed

# Good: Compile the model for faster execution (PyTorch 2.0+)

model = MyModel().to(device)

model = torch.compile(model, mode="reduce-overhead")

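# Modes: "default" (balanced speed vs. compile time), "reduce-overhead" (uses CUDA graphs to cut per-call overhead; helps small batches, may use more memory), "max-autotune" (longest compile; searches for the fastest kernels)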

Quick Reference: PyTorch Idioms

Idiom

Description

model.train() / model.eval()

Always set mode before train/eval

torch.no_grad()

Disable gradients for inference

optimizer.zero_grad(set_to_none=True)

More efficient gradient clearing (the default since PyTorch 2.0)

.to(device)

Device-agnostic tensor/model placement

torch.amp.autocast

Mixed precision for faster training and lower memory use (the gain depends on hardware and model)

pin_memory=True

Faster CPU→GPU data transfer

torch.compile

JIT compilation for speed (2.0+)

weights_only=True

Secure model loading

torch.manual_seed

Reproducible experiments

torch.utils.checkpoint.checkpoint

Trade compute for memory

Anti-Patterns to Avoid

# Bad: Forgetting model.eval() during validation

model.train()

with torch.no_grad():

    output = model(val_data)  # Dropout still active! BatchNorm uses batch stats!

# Good: Always set eval mode

model.eval()

with torch.no_grad():

    output = model(val_data)

# Bad: In-place operations breaking autograd

x = F.relu(x, inplace=True)  # Can break gradient computation

x += residual                  # In-place add breaks autograd graph

# Good: Out-of-place operations

x = F.relu(x)

x = x + residual

# Bad: Moving the model to the GPU inside the training loop on every iteration

for data, target in dataloader:

    model = model.cuda()  # Moves model EVERY iteration!

# Good: Move model once before the loop

model = model.to(device)

for data, target in dataloader:

    data, target = data.to(device), target.to(device)

# Bad: Using .item() before backward

loss = criterion(output, target).item()  # Detaches from graph!

loss.backward()  # Error: can't backprop through .item()

# Good: Call .item() only for logging

loss = criterion(output, target)

loss.backward()

print(f"Loss: {loss.item():.4f}")  # .item() after backward is fine

# Bad: Not using torch.save properly

torch.save(model, "model.pt")  # Saves entire model (fragile, not portable)

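# Good: Save state_dict (weights only, for inference or export; to resume training, save a full checkpoint as shown in "Save and Load Checkpoints")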

torch.save(model.state_dict(), "model.pt")

Remember: PyTorch code should be device-agnostic, reproducible, and memory-conscious. When in doubt, profile with torch.profiler and check GPU memory with torch.cuda.memory_summary().

BrowserAct

Let your agent run on any real-world website

Bypass CAPTCHA & anti-bot for free. Start local, scale to cloud.

Explore BrowserAct Skills →

Stop writing automation&scrapers

Install the CLI. Run your first Skill in 30 seconds. Scale when you're ready.

Start free
free · no credit card