computer-vision

Implement computer vision tasks including image classification, object detection, segmentation, and pose estimation using PyTorch and TensorFlow

INSTALLATION
npx skills add https://github.com/aj-geddes/useful-ai-prompts --skill computer-vision
Run in your project or agent environment. Adjust flags if your CLI version differs.

SKILL.md

Computer Vision

Overview

Computer vision enables machines to understand visual information from images and videos, powering applications like autonomous driving, medical imaging, and surveillance.

When to Use

  • Image classification and object recognition tasks
  • Object detection and localization in images
  • Semantic or instance segmentation projects
  • Pose estimation and human activity recognition
  • Face recognition and biometric systems
  • Medical imaging analysis and diagnostics

Computer Vision Tasks

  • Image Classification: Categorizing images into classes
  • Object Detection: Locating and classifying objects in images
  • Semantic Segmentation: Pixel-level classification
  • Instance Segmentation: Detecting each individual object instance and delineating it with its own pixel-level mask
  • Pose Estimation: Identifying human body joints
  • Face Recognition: Identifying individuals in images

Popular Architectures

  • Classification: ResNet, VGG, EfficientNet, Vision Transformer
  • Detection: YOLO, Faster R-CNN, SSD, RetinaNet
  • Segmentation: U-Net, DeepLab, Mask R-CNN
  • Pose: OpenPose, PoseNet, HRNet

Python Implementation

import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

import matplotlib.patches as patches

from PIL import Image, ImageDraw

import torch

import torch.nn as nn

from torch.utils.data import DataLoader, TensorDataset

from torchvision import transforms, models, datasets

import tensorflow as tf

from tensorflow import keras

from tensorflow.keras import layers

import cv2

from sklearn.metrics import accuracy_score, confusion_matrix

import seaborn as sns

import warnings

warnings.filterwarnings('ignore')

print("=== 1. Image Classification CNN ===")


# Define image classification model
class ImageClassifierCNN(nn.Module):
    """Small three-stage CNN that classifies 3-channel images.

    Each stage is Conv(3x3, padding=1) -> ReLU -> BatchNorm -> MaxPool(2),
    widening the channels 3 -> 32 -> 64 -> 128 while halving the spatial
    size. The resulting feature map is pooled to a fixed 4x4 grid and fed
    to a two-layer fully connected classifier.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(32),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(64),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(128),
            nn.MaxPool2d(2, 2),
        )

        # The classifier expects exactly 128 * 4 * 4 features. Pooling to a
        # fixed 4x4 grid is the identity for 32x32 inputs (the feature map
        # is already 4x4) and lets other input sizes work too. The layer
        # has no parameters, so state_dict keys are unchanged.
        self.pool = nn.AdaptiveAvgPool2d((4, 4))

        self.classifier = nn.Sequential(
            nn.Linear(128 * 4 * 4, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """Return class logits of shape (N, num_classes).

        Args:
            x: Batch of shape (N, 3, H, W). H and W must be at least 8 so
                that the three 2x2 max-pools leave a non-empty feature map.
        """
        x = self.features(x)
        x = self.pool(x)
        # flatten (unlike view) also handles non-contiguous tensors.
        x = torch.flatten(x, 1)
        return self.classifier(x)


model = ImageClassifierCNN(num_classes=10)

print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

# 2. Object Detection setup
print("\n=== 2. Object Detection Framework ===")


class ObjectDetector(nn.Module):
    """Toy single-object detector with a shared backbone and two heads.

    The backbone is two Conv(3x3) -> ReLU -> MaxPool(2) stages (3 -> 32 ->
    64 channels). Its output is pooled to a fixed 8x8 grid, flattened, and
    passed to two independent MLP heads: one regresses four box values,
    the other produces class logits.

    Args:
        num_classes: Number of class logits produced by the class head.
            Defaults to 10, the value that was previously hard-coded.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # Backbone
        self.backbone = nn.Sequential(
            nn.Conv2d(3, 32, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
        )

        # Both heads expect 64 * 8 * 8 features. A fixed 8x8 pooling grid is
        # the identity for 32x32 inputs and makes other sizes usable; it
        # adds no parameters.
        self.pool = nn.AdaptiveAvgPool2d((8, 8))

        # Bounding box regression
        self.bbox_head = nn.Sequential(
            nn.Linear(64 * 8 * 8, 128),
            nn.ReLU(),
            nn.Linear(128, 4),  # x, y, w, h
        )

        # Class prediction
        self.class_head = nn.Sequential(
            nn.Linear(64 * 8 * 8, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes),
        )

    def forward(self, x):
        """Return ``(bboxes, classes)`` for a batch of images.

        Args:
            x: Batch of shape (N, 3, H, W); H and W must be at least 4 so
                the two max-pools leave a non-empty feature map.

        Returns:
            Tuple of a (N, 4) box tensor and a (N, num_classes) logit
            tensor.
        """
        features = self.pool(self.backbone(x))
        features_flat = torch.flatten(features, 1)
        bboxes = self.bbox_head(features_flat)
        classes = self.class_head(features_flat)
        return bboxes, classes


detector = ObjectDetector()

print(f"Detector parameters: {sum(p.numel() for p in detector.parameters()):,}")

# 3. Semantic Segmentation
print("\n=== 3. Semantic Segmentation U-Net ===")


class UNet(nn.Module):
    """Two-level U-Net producing per-pixel class logits.

    Encoder: two double-conv blocks (3 -> 32 -> 64 channels), each followed
    by a 2x2 max-pool. Bottleneck: one double-conv block (64 -> 128).
    Decoder: two transposed-conv upsampling steps, each concatenated with
    the matching encoder output before a double-conv block. A final 1x1
    convolution maps 32 channels to ``num_classes``.

    Args:
        num_classes: Number of output channels (one logit map per class).
    """

    def __init__(self, num_classes=5):
        super().__init__()

        # Encoder
        self.enc1 = self._conv_block(3, 32)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.enc2 = self._conv_block(32, 64)
        self.pool2 = nn.MaxPool2d(2, 2)

        # Bottleneck
        self.bottleneck = self._conv_block(64, 128)

        # Decoder (input channels double because of the skip concatenation)
        self.upconv2 = nn.ConvTranspose2d(128, 64, 2, stride=2)
        self.dec2 = self._conv_block(128, 64)
        self.upconv1 = nn.ConvTranspose2d(64, 32, 2, stride=2)
        self.dec1 = self._conv_block(64, 32)

        # Final output
        self.out = nn.Conv2d(32, num_classes, 1)

    def _conv_block(self, in_channels, out_channels):
        """Build two size-preserving 3x3 convolutions, each followed by ReLU."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, 3, padding=1),
            nn.ReLU(inplace=True),
        )

    @staticmethod
    def _match_size(upsampled, skip):
        """Zero-pad ``upsampled`` so its H and W equal those of ``skip``.

        Max-pooling floors odd sizes, so after the 2x upsampling the map can
        be one pixel short of the encoder output. Without padding the skip
        concatenation raises a size-mismatch error for inputs whose height
        or width is not divisible by 4. For divisible sizes this is a no-op.
        """
        diff_h = skip.size(2) - upsampled.size(2)
        diff_w = skip.size(3) - upsampled.size(3)
        if diff_h == 0 and diff_w == 0:
            return upsampled
        # pad order is (left, right, top, bottom) for the last two dims.
        return nn.functional.pad(
            upsampled,
            [diff_w // 2, diff_w - diff_w // 2, diff_h // 2, diff_h - diff_h // 2],
        )

    def forward(self, x):
        """Return logits of shape (N, num_classes, H, W).

        Args:
            x: Batch of shape (N, 3, H, W); H and W must be at least 4 so
                the two max-pools leave a non-empty bottleneck.
        """
        enc1 = self.enc1(x)
        enc2 = self.enc2(self.pool1(enc1))
        bottleneck = self.bottleneck(self.pool2(enc2))

        up2 = self._match_size(self.upconv2(bottleneck), enc2)
        dec2 = self.dec2(torch.cat([up2, enc2], 1))

        up1 = self._match_size(self.upconv1(dec2), enc1)
        dec1 = self.dec1(torch.cat([up1, enc1], 1))

        return self.out(dec1)


unet = UNet(num_classes=5)

print(f"U-Net parameters: {sum(p.numel() for p in unet.parameters()):,}")

# 4. Transfer Learning
print("\n=== 4. Transfer Learning with Pre-trained Models ===")

try:
    # Load pre-trained ResNet18. torchvision >= 0.13 replaced the boolean
    # ``pretrained`` flag with weight enums; fall back to the legacy flag
    # when the enum is not present (older torchvision releases).
    resnet18_weights = getattr(models, "ResNet18_Weights", None)
    if resnet18_weights is not None:
        pretrained_model = models.resnet18(weights=resnet18_weights.DEFAULT)
    else:
        pretrained_model = models.resnet18(pretrained=True)

    # Swap the final fully connected layer for a fresh 10-class head.
    num_ftrs = pretrained_model.fc.in_features
    pretrained_model.fc = nn.Linear(num_ftrs, 10)

    print("Pre-trained ResNet18 adapted for 10 classes")
    print(f"Parameters: {sum(p.numel() for p in pretrained_model.parameters()):,}")
except Exception as exc:
    # Best-effort demo step: weights may be unavailable (e.g. no network).
    # Catch Exception rather than using a bare ``except`` so Ctrl-C and
    # SystemExit still propagate, and report why loading failed.
    print("Pre-trained models not available")
    print(f"  Reason: {type(exc).__name__}: {exc}")

# 5. Image preprocessing and augmentation
print("\n=== 5. Image Preprocessing and Augmentation ===")


def _resize_tensor_normalize():
    """Return fresh Resize -> ToTensor -> Normalize steps shared by both pipelines."""
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]
    return [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=channel_mean, std=channel_std),
    ]


# Deterministic pipeline: resize, convert to tensor, normalize.
transform_basic = transforms.Compose(_resize_tensor_normalize())

# Same pipeline preceded by random geometric and color perturbations.
_random_augmentations = [
    transforms.RandomRotation(20),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
]
transform_augmented = transforms.Compose(
    _random_augmentations + _resize_tensor_normalize()
)

print("Augmentation transforms defined")

# 6. Synthetic image data
print("\n=== 6. Synthetic Image Data Creation ===")


def create_synthetic_images(num_images=100, img_size=32, seed=None):
    """Create white RGB images that each contain one filled black shape.

    For every image a shape type is drawn at random: 0 = circle,
    1 = rectangle, 2 = triangle. The shape type is also the image's label.

    Args:
        num_images: Number of images to generate. ``0`` yields empty,
            correctly shaped arrays.
        img_size: Height and width of each square image in pixels. The
            circle and rectangle branches sample positions from
            ``[5, img_size - 5)`` and ``[0, img_size - 10)``, so values of
            10 or less make NumPy raise ``ValueError`` on those branches.
        seed: Optional seed for a private ``RandomState``. When ``None``
            (the default) the global ``np.random`` state is used, exactly
            as before this parameter existed.

    Returns:
        Tuple ``(images, labels)``: a float32 array of shape
        ``(num_images, img_size, img_size, 3)`` scaled to [0, 1], and an
        integer array of shape ``(num_images,)``.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Pre-allocating fixes the empty case: building from empty lists gave
    # a shapeless image array and a float label array that np.bincount
    # rejects.
    images = np.empty((num_images, img_size, img_size, 3), dtype=np.float32)
    labels = np.empty(num_images, dtype=int)
    black = (0, 0, 0)

    for i in range(num_images):
        canvas = np.full((img_size, img_size, 3), 255.0)

        # Randomly draw shapes. Coordinates are cast to plain ints (and the
        # polygon to int32) because those are the types OpenCV's drawing
        # functions are specified for.
        shape_type = rng.randint(0, 3)

        if shape_type == 0:  # Circle
            center = (int(rng.randint(5, img_size - 5)),
                      int(rng.randint(5, img_size - 5)))
            radius = int(rng.randint(3, 10))
            cv2.circle(canvas, center, radius, black, -1)
        elif shape_type == 1:  # Rectangle
            pt1 = (int(rng.randint(0, img_size - 10)),
                   int(rng.randint(0, img_size - 10)))
            pt2 = (pt1[0] + int(rng.randint(5, 15)),
                   pt1[1] + int(rng.randint(5, 15)))
            cv2.rectangle(canvas, pt1, pt2, black, -1)
        else:  # Triangle
            pts = np.array(
                [[rng.randint(0, img_size), rng.randint(0, img_size)]
                 for _ in range(3)],
                dtype=np.int32,
            )
            cv2.drawContours(canvas, [pts], 0, black, -1)

        labels[i] = shape_type
        images[i] = canvas.astype(np.float32) / 255.0

    return images, labels


X_images, y_labels = create_synthetic_images(num_images=300, img_size=32)

print(f"Synthetic dataset: {X_images.shape}, Labels: {y_labels.shape}")

print(f"Class distribution: {np.bincount(y_labels)}")

# 7. Visualization
print("\n=== 7. Visualization ===")

fig, axes = plt.subplots(3, 3, figsize=(12, 10))

# Display synthetic images: fill the 3x3 grid in row-major order, wrapping
# around the dataset if it holds fewer than nine samples.
for position, ax in enumerate(axes.flat):
    sample = position % len(X_images)
    ax.imshow(X_images[sample])
    ax.set_title(f"Class {y_labels[sample]}")
    ax.axis('off')

plt.suptitle("Synthetic Image Dataset", fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('synthetic_images.png', dpi=100, bbox_inches='tight')

print("Synthetic images saved as 'synthetic_images.png'")

# 8. Model architectures comparison
print("\n=== 8. Architecture Comparison ===")

# Fresh default-configured instances, used only to count parameters.
architectures_info = {
    'CNN': ImageClassifierCNN(),
    'ObjectDetector': ObjectDetector(),
    'U-Net': UNet(),
}

arch_data = {
    'Architecture': list(architectures_info.keys()),
    'Parameters': [sum(p.numel() for p in m.parameters()) for m in architectures_info.values()],
    'Use Case': ['Classification', 'Object Detection', 'Segmentation']
}

arch_df = pd.DataFrame(arch_data)

print("\nArchitecture Comparison:")
print(arch_df.to_string(index=False))

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Parameters comparison (log scale: the models differ by orders of magnitude)
axes[0].barh(arch_df['Architecture'], arch_df['Parameters'], color='steelblue')
axes[0].set_xlabel('Number of Parameters')
axes[0].set_title('Model Complexity Comparison')
axes[0].set_xscale('log')

# Use cases: one equal-height bar per architecture, colored by its task.
# The keys must match the 'Use Case' column exactly; the previous
# 'Detection' key did not match 'Object Detection' and raised KeyError.
colors_map = {'Classification': 'green', 'Object Detection': 'orange', 'Segmentation': 'red'}
bar_colors = [colors_map.get(uc, 'gray') for uc in arch_df['Use Case']]

axes[1].bar(arch_df['Architecture'], [1] * len(arch_df), color=bar_colors, alpha=0.7)
axes[1].set_ylabel('Primary Task')
axes[1].set_title('Architecture Use Cases')
axes[1].set_ylim([0, 1.5])

plt.tight_layout()
plt.savefig('cv_architecture_comparison.png', dpi=100, bbox_inches='tight')

print("\nArchitecture comparison saved as 'cv_architecture_comparison.png'")

# 9. Bounding box visualization
print("\n=== 9. Bounding Box Visualization ===")

fig, ax = plt.subplots(figsize=(10, 8))
ax.imshow(X_images[0])

# Draw sample bounding boxes, given as corner pairs (x1, y1, x2, y2)
bboxes = [
    (5, 5, 15, 15),
    (18, 10, 28, 20),
    (8, 20, 18, 28),
]

for x1, y1, x2, y2 in bboxes:
    # Rectangle takes the top-left corner plus width and height.
    outline = patches.Rectangle(
        (x1, y1),
        x2 - x1,
        y2 - y1,
        linewidth=2,
        edgecolor='red',
        facecolor='none',
    )
    ax.add_patch(outline)

ax.set_title('Bounding Box Detection Example')
ax.axis('off')

plt.savefig('bounding_boxes.png', dpi=100, bbox_inches='tight')

print("Bounding box visualization saved as 'bounding_boxes.png'")

print("\nComputer vision setup completed!")

Common CV Architectures

  • Classification: ResNet, EfficientNet, Vision Transformer
  • Detection: YOLO v5, Faster R-CNN, RetinaNet
  • Segmentation: U-Net, DeepLab v3, Mask R-CNN
  • Tracking: SORT, DeepSORT, ByteTrack

Image Preprocessing

  • Resizing to standard dimensions
  • Normalization with ImageNet stats
  • Data augmentation (rotation, flip, crop)
  • Color space conversion

Evaluation Metrics

  • Classification: Accuracy, Precision, Recall, F1
  • Detection: mAP (mean Average Precision), IoU
  • Segmentation: IoU, Dice coefficient, Hausdorff distance

Deliverables

  • Trained vision model
  • Inference pipeline
  • Performance evaluation
  • Visualization results
  • Model optimization report
  • Deployment guide
BrowserAct

Let your agent run on any real-world website

Bypass CAPTCHA & anti-bot for free. Start local, scale to cloud.

Explore BrowserAct Skills →

Stop writing automation & scrapers

Install the CLI. Run your first Skill in 30 seconds. Scale when you're ready.

Start free
free · no credit card