SKILL.md

Neural Network Design

Name: neural-network-design
Author: aj-geddes

Overview

This skill covers designing and implementing neural network architectures, including CNNs, RNNs, Transformers, and ResNets, using PyTorch and TensorFlow, with a focus on architecture selection, layer composition, and optimization techniques.

When to Use

Designing custom neural network architectures for computer vision tasks like image classification or object detection

Building sequence models for time series forecasting, natural language processing, or video analysis

Implementing transformer-based models for language understanding or generation tasks

Creating hybrid architectures that combine CNNs, RNNs, and attention mechanisms

Optimizing network depth, width, and skip connections for better training and performance

Selecting appropriate activation functions, normalization layers, and regularization techniques

Core Architecture Types

Feedforward Networks (MLPs): Fully connected layers

Convolutional Networks (CNNs): Image processing

Recurrent Networks (RNNs, LSTMs, GRUs): Sequence processing

Transformers: Self-attention based architecture

Hybrid Models: Combining multiple architecture types

Network Design Principles

Depth vs Width: Trade-offs between the number of layers and the number of units per layer

Skip Connections: Residual connections that make deeper networks trainable

Normalization: Batch norm and layer norm for training stability

Regularization: Dropout and L1/L2 weight penalties to prevent overfitting

Activation Functions: ReLU, GELU, Swish for non-linearity

PyTorch and TensorFlow Implementation

import torch

import torch.nn as nn

import tensorflow as tf

from tensorflow import keras

import numpy as np

import matplotlib.pyplot as plt

# 1. Feedforward Neural Network (MLP)
# Print the banner that introduces the MLP example defined right below.

print("=== 1. Feedforward Neural Network ===")

class MLPPyTorch(nn.Module):
    """Fully connected classifier assembled from a list of hidden widths.

    Every hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout(0.3); a
    final Linear layer maps the last hidden width to ``output_size``.

    Args:
        input_size: Number of input features.
        hidden_sizes: Iterable with the width of each hidden layer, in order.
            An empty iterable yields a single Linear layer.
        output_size: Number of output features.
    """

    def __init__(self, input_size, hidden_sizes, output_size):
        super().__init__()

        # Widths seen by consecutive Linear layers: input, h1, h2, ...
        widths = [input_size, *hidden_sizes]

        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.extend([
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(0.3),
            ])

        # Output projection from the last hidden width (or the input width
        # when there are no hidden layers).
        stack.append(nn.Linear(widths[-1], output_size))

        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the result."""
        return self.model(x)

# Build the MLP example: 784 inputs, hidden layers of 512/256/128 units, 10 outputs.
mlp = MLPPyTorch(input_size=784, hidden_sizes=[512, 256, 128], output_size=10)

# Report the total number of elements across all parameter tensors,
# formatted with thousands separators.
print(f"MLP Parameters: {sum(p.numel() for p in mlp.parameters()):,}")

# 2. Convolutional Neural Network (CNN)

# Section banner; the leading "\n" separates it from the previous section's output.
print("\n=== 2. Convolutional Neural Network ===")

class CNNPyTorch(nn.Module):
    """Three-stage convolutional classifier with a two-layer dense head.

    Each stage is Conv2d(3x3, padding=1) -> BatchNorm2d -> ReLU -> MaxPool2d(2),
    widening the channels 32 -> 64 -> 128 while halving the spatial size.

    The original head assumed a 32x32 input (so the feature map reaching
    ``fc1`` was exactly 128 x 4 x 4) and crashed on anything else. An adaptive
    average pool now forces the map to 4x4 before flattening; for a 32x32
    input that pool is an identity, so outputs and parameters are unchanged.

    Args:
        in_channels: Number of channels in the input image (default 3).
        num_classes: Number of output logits (default 10).
    """

    def __init__(self, in_channels=3, num_classes=10):
        super().__init__()

        # Conv blocks
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.pool1 = nn.MaxPool2d(2, 2)

        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.pool2 = nn.MaxPool2d(2, 2)

        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.pool3 = nn.MaxPool2d(2, 2)

        # Parameter-free resize to the 4x4 grid that fc1 expects, so the
        # network accepts inputs other than 32x32.
        self.adaptive_pool = nn.AdaptiveAvgPool2d((4, 4))

        # Fully connected layers
        self.fc1 = nn.Linear(128 * 4 * 4, 256)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(256, num_classes)

        self.relu = nn.ReLU()

    def forward(self, x):
        """Return class logits for a batch of images.

        Args:
            x: Tensor of shape (batch, in_channels, H, W). H and W must be at
                least 8 so that the three 2x2 max-pools leave a non-empty map.

        Returns:
            Tensor of shape (batch, num_classes) with unnormalized logits.
        """
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.pool1(x)

        x = self.relu(self.bn2(self.conv2(x)))
        x = self.pool2(x)

        x = self.relu(self.bn3(self.conv3(x)))
        x = self.pool3(x)

        x = self.adaptive_pool(x)
        x = x.view(x.size(0), -1)

        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)

        return x

# Build the CNN example with its default configuration.
cnn = CNNPyTorch()

# Report the total number of elements across all parameter tensors,
# formatted with thousands separators.
print(f"CNN Parameters: {sum(p.numel() for p in cnn.parameters()):,}")

# 3. Recurrent Neural Network (LSTM)

# Section banner; the leading "\n" separates it from the previous section's output.
print("\n=== 3. LSTM Network ===")

class LSTMPyTorch(nn.Module):
    """Stacked LSTM that predicts from the final hidden state of the top layer.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of each LSTM layer's hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of output features.
        dropout: Dropout probability applied between stacked LSTM layers
            (default 0.3). Ignored when ``num_layers`` is 1.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size,
                 dropout=0.3):
        super().__init__()

        # nn.LSTM only applies dropout between layers, so a non-zero value
        # with a single layer does nothing and triggers a UserWarning.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            batch_first=True, dropout=inter_layer_dropout)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Encode a batch of sequences and project the last hidden state.

        Args:
            x: Tensor of shape (batch, seq_len, input_size); the LSTM is
                created with ``batch_first=True``.

        Returns:
            Tensor of shape (batch, output_size).
        """
        _, (h_n, _) = self.lstm(x)

        # h_n is indexed by layer; [-1] selects the top layer's final state.
        last_hidden = h_n[-1]

        return self.fc(last_hidden)

# Build the LSTM example: 100 input features, two stacked layers of 128 units, 10 outputs.
lstm = LSTMPyTorch(input_size=100, hidden_size=128, num_layers=2, output_size=10)

# Report the total number of elements across all parameter tensors,
# formatted with thousands separators.
print(f"LSTM Parameters: {sum(p.numel() for p in lstm.parameters()):,}")

# 4. Transformer Block

# Section banner; the leading "\n" separates it from the previous section's output.
print("\n=== 4. Transformer Architecture ===")

class TransformerBlock(nn.Module):
    """Post-norm Transformer encoder block: self-attention, then feedforward.

    Both sub-layers are wrapped in a residual connection followed by
    LayerNorm.

    The attention layer is built with ``batch_first=True`` so the block
    consumes (batch, seq_len, d_model) tensors. With the PyTorch default
    (``batch_first=False``) a batch-first input is read as
    (seq_len, batch, d_model), and attention would mix different samples of
    the batch instead of different positions of one sequence.

    Args:
        d_model: Feature size of each token vector.
        num_heads: Number of attention heads (nn.MultiheadAttention requires
            ``d_model`` to be divisible by it).
        d_ff: Hidden width of the position-wise feedforward network.
        dropout: Dropout probability used in attention and feedforward.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()

        self.attention = nn.MultiheadAttention(
            d_model, num_heads, dropout=dropout, batch_first=True
        )

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        self.feedforward = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        """Apply self-attention and feedforward with residual + LayerNorm.

        Args:
            x: Tensor of shape (batch, seq_len, d_model).

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Self-attention: queries, keys and values all come from x.
        attn_out, _ = self.attention(x, x, x)
        x = self.norm1(x + attn_out)

        # Feedforward
        ff_out = self.feedforward(x)
        x = self.norm2(x + ff_out)

        return x

class TransformerPyTorch(nn.Module):
    """Sequence classifier: embedding -> Transformer blocks -> mean pool -> linear.

    Self-attention followed by mean pooling is invariant to the order of the
    tokens, so without positional information the model cannot tell
    "a b c" from "c b a". A fixed sinusoidal positional encoding is therefore
    added to the embeddings. It is computed on the fly and adds no parameters
    or buffers, so parameter counts and state-dict keys are unchanged.

    Args:
        vocab_size: Number of distinct token ids.
        d_model: Embedding / model width.
        num_heads: Attention heads per block.
        num_layers: Number of stacked TransformerBlock modules.
        d_ff: Hidden width of each block's feedforward network.
        num_classes: Number of output logits (default 10, the value that was
            previously hard-coded).
        use_positional_encoding: Set to False to skip the sinusoidal encoding
            and feed raw embeddings to the blocks (default True).
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers, d_ff,
                 num_classes=10, use_positional_encoding=True):
        super().__init__()

        self.d_model = d_model
        self.use_positional_encoding = use_positional_encoding

        self.embedding = nn.Embedding(vocab_size, d_model)

        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(d_model, num_heads, d_ff)
            for _ in range(num_layers)
        ])

        self.fc = nn.Linear(d_model, num_classes)

    def _positional_encoding(self, seq_len, device, dtype):
        """Return the (seq_len, d_model) sinusoidal position table."""
        positions = torch.arange(
            seq_len, device=device, dtype=torch.float32
        ).unsqueeze(1)
        even_dims = torch.arange(
            0, self.d_model, 2, device=device, dtype=torch.float32
        )
        # angle[pos, i] = pos / 10000^(2i / d_model)
        angles = positions / (10000.0 ** (even_dims / self.d_model))

        table = torch.zeros(
            seq_len, self.d_model, device=device, dtype=torch.float32
        )
        table[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns.
        table[:, 1::2] = torch.cos(angles[:, : self.d_model // 2])
        return table.to(dtype)

    def forward(self, x):
        """Classify a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids; assumed to be (batch, seq_len),
                since dimension 1 is the one averaged away after the blocks.

        Returns:
            Tensor of shape (batch, num_classes) with unnormalized logits.
        """
        x = self.embedding(x)

        if self.use_positional_encoding:
            # Broadcasts (seq_len, d_model) over the batch dimension.
            x = x + self._positional_encoding(x.size(1), x.device, x.dtype)

        for block in self.transformer_blocks:
            x = block(x)

        x = x.mean(dim=1)  # Global average pooling
        x = self.fc(x)

        return x

# Build the Transformer example: 1000-token vocabulary, width 256, 8 heads,
# 3 blocks, feedforward width 512.
transformer = TransformerPyTorch(vocab_size=1000, d_model=256, num_heads=8,

                                 num_layers=3, d_ff=512)

# Report the total number of elements across all parameter tensors,
# formatted with thousands separators.
print(f"Transformer Parameters: {sum(p.numel() for p in transformer.parameters()):,}")

# 5. Residual Network (ResNet)

# Section banner; the leading "\n" separates it from the previous section's output.
print("\n=== 5. Residual Network ===")

class ResidualBlock(nn.Module):
    """Two 3x3 conv/BN layers with an additive shortcut around them.

    When the block changes the spatial stride or the channel count, the
    shortcut is a 1x1 convolution plus BatchNorm so both paths have the same
    shape; otherwise it is an empty ``nn.Sequential`` (identity).

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels produced by the block.
        stride: Stride of the first convolution and of the projection
            shortcut (default 1).
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()

        # Main path
        self.conv1 = nn.Conv2d(
            in_channels, out_channels, kernel_size=3, stride=stride, padding=1
        )
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU()

        # Shortcut path: project only when the shapes would not match.
        shape_changes = not (stride == 1 and in_channels == out_channels)
        if shape_changes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride),
                nn.BatchNorm2d(out_channels),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        skip = self.shortcut(x)

        branch = self.conv1(x)
        branch = self.bn1(branch)
        branch = self.relu(branch)

        branch = self.conv2(branch)
        branch = self.bn2(branch)

        return self.relu(branch + skip)

class ResNetPyTorch(nn.Module):
    """ResNet-style image classifier with a 3-4-6-3 stack of ResidualBlocks.

    Layout: 7x7 stride-2 stem convolution -> BatchNorm -> ReLU -> 3x3 max-pool
    -> four residual stages (64, 128, 256, 512 channels) -> global average
    pool -> linear head.

    The stem previously went straight from BatchNorm to max-pool; the missing
    ReLU is restored here. It has no parameters, so parameter counts and
    state-dict keys are unchanged.

    Args:
        num_classes: Number of output logits (default 10, the value that was
            previously hard-coded).
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # Stem
        self.conv1 = nn.Conv2d(3, 64, 7, stride=2, padding=3)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(3, stride=2, padding=1)

        # Residual stages; every stage after the first halves the resolution.
        self.layer1 = self._make_layer(64, 64, 3, stride=1)
        self.layer2 = self._make_layer(64, 128, 4, stride=2)
        self.layer3 = self._make_layer(128, 256, 6, stride=2)
        self.layer4 = self._make_layer(256, 512, 3, stride=2)

        # Head
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)

    def _make_layer(self, in_channels, out_channels, blocks, stride):
        """Build one stage of ``blocks`` ResidualBlocks.

        Only the first block receives ``stride`` and the channel change; the
        remaining blocks keep ``out_channels`` and use the default stride.
        """
        layers = [ResidualBlock(in_channels, out_channels, stride)]
        layers.extend(
            ResidualBlock(out_channels, out_channels) for _ in range(1, blocks)
        )
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return class logits for a batch of 3-channel images.

        Args:
            x: Tensor of shape (batch, 3, H, W).

        Returns:
            Tensor of shape (batch, num_classes) with unnormalized logits.
        """
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x

# Build the ResNet example with its default configuration.
resnet = ResNetPyTorch()

# Report the total number of elements across all parameter tensors,
# formatted with thousands separators.
print(f"ResNet Parameters: {sum(p.numel() for p in resnet.parameters()):,}")

# 6. TensorFlow Keras model (Sequential CNN built from stock layers)

print("\n=== 6. TensorFlow Keras Model ===")

# The input shape is declared with an explicit keras.Input as the first entry
# of the Sequential model rather than an `input_shape=` argument on the first
# layer; Keras 3 warns about the latter. The layer stack and parameter count
# are unchanged.
tf_model = keras.Sequential([
    keras.Input(shape=(32, 32, 3)),

    keras.layers.Conv2D(32, (3, 3), activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.MaxPooling2D((2, 2)),

    keras.layers.Conv2D(64, (3, 3), activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.MaxPooling2D((2, 2)),

    keras.layers.Conv2D(128, (3, 3), activation='relu'),
    keras.layers.BatchNormalization(),

    # Collapse each feature map to a single value before the dense head.
    keras.layers.GlobalAveragePooling2D(),

    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(10, activation='softmax'),
])

print(f"TensorFlow Model Parameters: {tf_model.count_params():,}")

# Print the layer-by-layer summary table.
tf_model.summary()

# 7. Model comparison

# PyTorch models built earlier in this script, keyed by the label used in the chart.
models_info = {
    'MLP': mlp,
    'CNN': cnn,
    'LSTM': lstm,
    'Transformer': transformer,
    'ResNet': resnet,
}

# Total number of parameter elements per model.
param_counts = {name: sum(p.numel() for p in model.parameters())
                for name, model in models_info.items()}

# Left panel: bar chart of parameter counts. Right panel: description table.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Parameter counts
axes[0].barh(list(param_counts.keys()), list(param_counts.values()), color='steelblue')
axes[0].set_xlabel('Number of Parameters')
axes[0].set_title('Model Complexity Comparison')
# Log scale keeps the smaller models visible next to the largest one.
axes[0].set_xscale('log')

# Architecture comparison table
architectures = {
    'MLP': 'Feedforward, Dense layers',
    'CNN': 'Conv layers, Pooling',
    'LSTM': 'Recurrent, Long-term memory',
    'Transformer': 'Self-attention, Parallel processing',
    'ResNet': 'Residual connections, Skip paths',
}

# The table replaces the plot area, so hide the axes frame and ticks.
axes[1].axis('off')

# One [model, description] row per entry, in dictionary order.
table_data = [[name, description] for name, description in architectures.items()]

table = axes[1].table(cellText=table_data, colLabels=['Model', 'Architecture'],
                      cellLoc='left', loc='center', bbox=[0, 0, 1, 1])

# Fix the font size instead of letting matplotlib shrink it to fit.
table.auto_set_font_size(False)
table.set_fontsize(9)
table.scale(1, 2)

plt.tight_layout()
plt.savefig('neural_network_architectures.png', dpi=100, bbox_inches='tight')

print("\nVisualization saved as 'neural_network_architectures.png'")
print("\nNeural network design analysis complete!")

Architecture Selection Guide

MLP: Tabular data, simple classification

CNN: Image classification, object detection

LSTM/GRU: Time series, sequential data

Transformer: NLP, long-range dependencies

ResNet: Very deep networks, image tasks

Key Design Considerations

Input/output shape compatibility

Receptive field size for CNNs

Sequence length for RNNs

Attention head count for Transformers

Skip connection placement for ResNets

Deliverables

Network architecture definition

Parameter count analysis

Layer-by-layer description

Data flow diagrams

Performance benchmarks

Deployment requirements

neural-network-design