Name: ml-model-training
Author: aj-geddes

SKILL.md

$27

Python Implementation

import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import (accuracy_score, precision_score, recall_score,

                            f1_score, confusion_matrix, roc_auc_score)

import torch

import torch.nn as nn

from torch.utils.data import DataLoader, TensorDataset

import tensorflow as tf

from tensorflow import keras

# 1. Build a reproducible synthetic binary-classification dataset
np.random.seed(42)

n_samples = 1000
n_features = 20

# Draw the feature matrix first and the label noise second so the global RNG
# is consumed in a fixed order.
X = np.random.randn(n_samples, n_features)
label_noise = np.random.randn(n_samples) * 0.5

# Only the first three columns carry signal: the label is 1 when the noisy
# linear score is positive.
linear_score = X[:, 0] + X[:, 1] - X[:, 2]
y = (linear_score + label_noise > 0).astype(int)

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Dataset shapes:")
print(f"Training: {X_train_scaled.shape}, Testing: {X_test_scaled.shape}")
print(f"Class distribution: {np.bincount(y_train)}")

# 2. Scikit-learn models: fit each classifier and score it on the test split
print("\n=== Scikit-learn Models ===")

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
}

# Metrics computed from hard class predictions, in reporting order.
label_scorers = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
}

sklearn_results = {}

for model_name, estimator in models.items():
    estimator.fit(X_train_scaled, y_train)

    predicted_labels = estimator.predict(X_test_scaled)
    # Column 1 of predict_proba is the probability of the positive class.
    positive_proba = estimator.predict_proba(X_test_scaled)[:, 1]

    scores = {
        metric_name: scorer(y_test, predicted_labels)
        for metric_name, scorer in label_scorers.items()
    }
    # ROC-AUC is ranked on probabilities rather than thresholded labels.
    scores['roc_auc'] = roc_auc_score(y_test, positive_proba)
    sklearn_results[model_name] = scores

    print(f"\n{model_name}:")
    for metric_name, metric_value in scores.items():
        print(f"  {metric_name}: {metric_value:.4f}")

# 3. PyTorch neural network
print("\n=== PyTorch Model ===")


class NeuralNetPyTorch(nn.Module):
    """Feed-forward binary classifier: input -> 64 -> 32 -> 1.

    Each hidden layer is followed by ReLU and dropout (p=0.3). The single
    output unit goes through a sigmoid, so ``forward`` returns one value in
    [0, 1] per input row.
    """

    def __init__(self, input_size):
        """Create the layers for inputs with ``input_size`` features."""
        super().__init__()
        first_hidden, second_hidden = 64, 32
        self.fc1 = nn.Linear(input_size, first_hidden)
        self.fc2 = nn.Linear(first_hidden, second_hidden)
        self.fc3 = nn.Linear(second_hidden, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Return the sigmoid output for a batch ``x``."""
        hidden = x
        # Both hidden layers share the same linear -> ReLU -> dropout pattern.
        for linear in (self.fc1, self.fc2):
            hidden = self.dropout(self.relu(linear(hidden)))
        return torch.sigmoid(self.fc3(hidden))

# Seed PyTorch before the model is built so weight initialisation, batch
# shuffling and dropout masks are repeatable, in line with the fixed seed (42)
# this script already uses for NumPy and scikit-learn.
torch.manual_seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
pytorch_model = NeuralNetPyTorch(n_features).to(device)

# The network already applies a sigmoid, so plain BCELoss on probabilities is
# the matching criterion.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(pytorch_model.parameters(), lr=0.001)

# Create data loaders. Features and labels are converted to float32
# explicitly; labels get a trailing dimension to match the (batch, 1) output.
X_train_tensor = torch.as_tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.as_tensor(y_train, dtype=torch.float32).unsqueeze(1)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Train PyTorch model
epochs = 50
pytorch_losses = []  # mean batch loss per epoch, plotted later

# Make training mode explicit so dropout is active during the loop.
pytorch_model.train()
for epoch in range(epochs):
    total_loss = 0.0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        optimizer.zero_grad()
        outputs = pytorch_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    pytorch_losses.append(total_loss / len(train_loader))

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {pytorch_losses[-1]:.4f}")

# Evaluate PyTorch: eval mode disables dropout, no_grad skips autograd tracking
pytorch_model.eval()
X_test_tensor = torch.as_tensor(X_test_scaled, dtype=torch.float32).to(device)
with torch.no_grad():
    test_probabilities = pytorch_model(X_test_tensor)

# Threshold the probabilities at 0.5 to get hard 0/1 predictions.
y_pred_pytorch = (test_probabilities.cpu().numpy() > 0.5).astype(int).flatten()
print(f"\nPyTorch Accuracy: {accuracy_score(y_test, y_pred_pytorch):.4f}")

# 4. TensorFlow/Keras model
print("\n=== TensorFlow/Keras Model ===")

# Fix the random seeds so weight initialisation, dropout and shuffling are
# repeatable, in line with the seed (42) used for the other models.
keras.utils.set_random_seed(42)

# Declare the input with an explicit Input object as the first entry; passing
# `input_shape=` to a layer is discouraged by current Keras releases.
tf_model = keras.Sequential([
    keras.Input(shape=(n_features,)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(1, activation='sigmoid'),
])

tf_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# validation_split holds back 20% of the training data for the per-epoch
# validation metrics stored in `history`.
history = tf_model.fit(
    X_train_scaled, y_train,
    batch_size=32,
    epochs=50,
    validation_split=0.2,
    verbose=0,
)

# Threshold the sigmoid outputs at 0.5; verbose=0 keeps the console output
# consistent with the silent fit above.
y_pred_tf = (tf_model.predict(X_test_scaled, verbose=0) > 0.5).astype(int).flatten()
print(f"TensorFlow Accuracy: {accuracy_score(y_test, y_pred_tf):.4f}")

# 5. Visualization: a single 2x2 figure summarising the results
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
(ax_compare, ax_loss), (ax_rf, ax_history) = axes

# Top-left: test accuracy of every model
models_names = [*sklearn_results, 'PyTorch', 'TensorFlow']
accuracies = [scores['accuracy'] for scores in sklearn_results.values()]
accuracies.append(accuracy_score(y_test, y_pred_pytorch))
accuracies.append(accuracy_score(y_test, y_pred_tf))

bar_positions = range(len(models_names))
ax_compare.bar(bar_positions, accuracies, color='steelblue')
ax_compare.set_xticks(bar_positions)
ax_compare.set_xticklabels(models_names, rotation=45)
ax_compare.set_ylabel('Accuracy')
ax_compare.set_title('Model Comparison')
ax_compare.set_ylim([0, 1])

# Top-right: training loss per epoch for the two neural networks
loss_curves = (
    (pytorch_losses, 'PyTorch'),
    (history.history['loss'], 'TensorFlow'),
)
for curve, framework in loss_curves:
    ax_loss.plot(curve, label=framework, linewidth=2)
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.set_title('Training Loss Comparison')
ax_loss.legend()
ax_loss.grid(True, alpha=0.3)

# Bottom-left: label-based metrics for the random forest
metrics = ['accuracy', 'precision', 'recall', 'f1']
random_forest_scores = sklearn_results['Random Forest']
rf_metrics = [random_forest_scores[m] for m in metrics]
ax_rf.bar(metrics, rf_metrics, color='coral')
ax_rf.set_ylabel('Score')
ax_rf.set_title('Random Forest Metrics')
ax_rf.set_ylim([0, 1])

# Bottom-right: Keras training vs. validation accuracy per epoch
accuracy_curves = (
    ('accuracy', 'Training'),
    ('val_accuracy', 'Validation'),
)
for history_key, curve_label in accuracy_curves:
    ax_history.plot(history.history[history_key], label=curve_label, linewidth=2)
ax_history.set_xlabel('Epoch')
ax_history.set_ylabel('Accuracy')
ax_history.set_title('TensorFlow Training History')
ax_history.legend()
ax_history.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('model_training_comparison.png', dpi=100, bbox_inches='tight')

print("\nVisualization saved as 'model_training_comparison.png'")
print("\nModel training completed!")

Training Best Practices

Data Split: 70/15/15 for train/validation/test

Scaling: Standardize features before training, fitting the scaler on the training split only

Cross-validation: Use K-fold for robust evaluation

Early Stopping: Prevent overfitting

Class Balancing: Handle imbalanced datasets

Key Metrics

Accuracy: Overall correctness

Precision: Fraction of positive predictions that are correct

Recall: True positive detection rate

F1 Score: Harmonic mean of precision/recall

ROC-AUC: Threshold-independent metric

Deliverables

Trained model checkpoint

Performance metrics on test set

Feature importance analysis

Learning curves

Hyperparameter configuration

Model evaluation report

ml-model-training

SKILL.md

Python Implementation

Training Best Practices

Key Metrics

Deliverables

Let your agent run on any real-world website

Related skills

Stop writing automation & scrapers