smart-ocr

>

INSTALLATION
npx skills add https://github.com/claude-office-skills/skills --skill smart-ocr
Run in your project or agent environment. Adjust flags if your CLI version differs.

SKILL.md

Smart OCR Skill

Overview

This skill enables intelligent text extraction from images and scanned documents using PaddleOCR, a leading OCR engine supporting 100+ languages. Extract text from photos, screenshots, and scanned PDFs with high accuracy; handwritten documents are also supported, although accuracy on handwriting varies (see Limitations).

How to Use

  • Provide the image or scanned document
  • Optionally specify language(s) to detect
  • I'll extract text with position and confidence data

Example prompts:

  • "Extract all text from this screenshot"
  • "OCR this scanned PDF document"
  • "Read the text from this business card photo"
  • "Extract Chinese and English text from this image"

Domain Knowledge

PaddleOCR Fundamentals

from paddleocr import PaddleOCR

# Initialize OCR engine (angle classifier enabled, English models)
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Run OCR on image
result = ocr.ocr('image.png', cls=True)

# Result structure: one entry per input image, so result[0] holds the lines
# of 'image.png' as [[box, (text, confidence)], ...].
# That entry is None when no text was detected, so fall back to an empty
# list instead of iterating over None.
detected_lines = result[0] if result and result[0] else []

for line in detected_lines:
    box = line[0]            # [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
    text, conf = line[1][0], line[1][1]  # Extracted text, confidence score
    print(f"{text} ({conf:.2f})")

Supported Languages

# Common language codes, as passed to PaddleOCR(lang=...).
# NOTE(review): codes follow the PaddleOCR 2.x language table; confirm them
# against the PaddleOCR version you have installed.
languages = {
    'en': 'English',
    'ch': 'Chinese (Simplified)',
    'chinese_cht': 'Chinese (Traditional)',
    'japan': 'Japanese',
    'korean': 'Korean',
    'french': 'French',
    'german': 'German',
    'es': 'Spanish',
    'ru': 'Russian',
    'arabic': 'Arabic',
    'hi': 'Hindi',
    'vi': 'Vietnamese',
    'th': 'Thai',  # NOTE(review): not listed in the 2.x table - verify
    # ... 100+ languages supported
}

# Use specific language: pass one language code per engine instance
ocr = PaddleOCR(lang='ch')  # Chinese
ocr = PaddleOCR(lang='japan')  # Japanese

# There is no 'multilingual' / auto-detect code. For mixed-language pages,
# create one engine per language and merge the results yourself.
ocr = PaddleOCR(lang='en')  # English

Configuration Options

from paddleocr import PaddleOCR

# Gather every engine option in one mapping, then build the engine from it.
ocr_options = dict(
    # --- Detection ---
    det_model_dir=None,         # custom detection model directory
    det_limit_side_len=960,     # maximum side length used for detection
    det_db_thresh=0.3,          # binarization threshold
    det_db_box_thresh=0.5,      # box score threshold
    # --- Recognition ---
    rec_model_dir=None,         # custom recognition model directory
    rec_char_dict_path=None,    # custom character dictionary
    # --- Angle classification ---
    use_angle_cls=True,         # turn the angle classifier on
    cls_model_dir=None,         # custom classification model directory
    # --- Language ---
    lang='en',                  # language code
    # --- Performance ---
    use_gpu=True,               # run on GPU if available
    gpu_mem=500,                # GPU memory limit (MB)
    enable_mkldnn=True,         # CPU optimization
    # --- Output ---
    show_log=False,             # keep the engine's logging quiet
)

ocr = PaddleOCR(**ocr_options)

Processing Different Sources

#### Image Files

# Single image
result = ocr.ocr('image.png')

# Multiple images: reuse the same engine and keep the parsed lines per file.
images = ['img1.png', 'img2.png', 'img3.png']
parsed = {}

for img in images:
    result = ocr.ocr(img)
    # process_ocr_result is the helper from the "Result Processing" section;
    # keep its return value instead of discarding it.
    parsed[img] = process_ocr_result(result)

#### PDF Files (Scanned)

from pdf2image import convert_from_path

def ocr_pdf(pdf_path):
    """OCR a scanned PDF and return its text, one section per page.

    Uses the module-level ``ocr`` engine.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string in which each page's text follows a
        ``--- Page N ---`` header and pages are separated by a blank line.
        A page with no detected text contributes only its header.
    """
    # Function-scope imports: the original snippet called os.remove without
    # importing os anywhere.
    import os
    import tempfile

    # Convert PDF pages to images
    images = convert_from_path(pdf_path)

    all_text = []

    # A private temporary directory avoids clobbering files in the working
    # directory and is cleaned up even when saving or OCR raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        for i, img in enumerate(images):
            temp_path = os.path.join(tmp_dir, f'temp_page_{i}.png')
            img.save(temp_path)

            result = ocr.ocr(temp_path)

            # The per-image entry is None when the page has no text.
            page_lines = result[0] if result and result[0] else []
            page_text = '\n'.join(line[1][0] for line in page_lines)
            all_text.append(f"--- Page {i+1} ---\n{page_text}")

    return '\n\n'.join(all_text)

#### URLs and Bytes

import requests

from io import BytesIO

# From URL
# ocr.ocr() takes a path/URL string, a numpy array or raw encoded bytes;
# it does not take file-like objects, so pass the bytes themselves rather
# than wrapping them in BytesIO.
response = requests.get('https://example.com/image.png', timeout=30)
response.raise_for_status()  # fail here rather than OCR-ing an error page
result = ocr.ocr(response.content)

# From bytes
with open('image.png', 'rb') as f:
    img_bytes = f.read()

result = ocr.ocr(img_bytes)

Result Processing

def process_ocr_result(result):
    """Process OCR result into structured data.

    Args:
        result: OCR output whose first element (``result[0]``) is the list
            of ``[box, (text, confidence)]`` entries for one image. ``None``,
            an empty list, or a first element of ``None`` (no text detected)
            are all treated as "no lines".

    Returns:
        A list of dicts, one per detected line, with keys:
        ``'text'``, ``'confidence'``, ``'bbox'`` (axis-aligned
        ``left``/``top``/``right``/``bottom`` computed from the box points)
        and ``'raw_box'`` (the original box points). Empty when there is
        nothing to process.
    """
    # Guard first: iterating over a missing/None page would raise TypeError.
    if not result or not result[0]:
        return []

    lines = []
    for entry in result[0]:
        box = entry[0]
        text, confidence = entry[1][0], entry[1][1]

        # Collapse the box points into an axis-aligned bounding box.
        x_coords = [point[0] for point in box]
        y_coords = [point[1] for point in box]

        lines.append({
            'text': text,
            'confidence': confidence,
            'bbox': {
                'left': min(x_coords),
                'top': min(y_coords),
                'right': max(x_coords),
                'bottom': max(y_coords),
            },
            'raw_box': box,
        })

    return lines

# Reading order: compare top edges first, then left edges
def sort_by_position(lines):
    """Return a new list of line dicts ordered by (bbox top, bbox left).

    The input is not modified; entries with equal keys keep their
    relative order.
    """
    def reading_order(entry):
        bbox = entry['bbox']
        return bbox['top'], bbox['left']

    ordered = list(lines)
    ordered.sort(key=reading_order)
    return ordered

Text Layout Reconstruction

def reconstruct_layout(result, line_threshold=10):
    """Reconstruct text layout from OCR results.

    Detected fragments are grouped into visual lines by comparing each
    fragment's top edge with the previous fragment's top edge.

    Args:
        result: OCR output as accepted by ``process_ocr_result``.
        line_threshold: A fragment starts a new visual line when its top
            edge differs from the previous fragment's top edge by this
            amount or more.

    Returns:
        The text with one visual line per output line and fragments within
        a line joined by single spaces. Empty string when nothing was
        detected.
    """
    # Nothing detected (None / empty page): there is no layout to rebuild.
    if not result or not result[0]:
        return ''

    lines = sort_by_position(process_ocr_result(result))

    def join_row(row):
        # The global sort is by (top, left), so fragments on the same visual
        # line whose tops differ by a pixel or two arrive ordered by top.
        # Re-sort the row by left edge to restore left-to-right reading order.
        left_to_right = sorted(row, key=lambda item: item['bbox']['left'])
        return ' '.join(item['text'] for item in left_to_right)

    text_lines = []
    current_line = []
    current_y = None

    for line in lines:
        y = line['bbox']['top']

        # Close the current row once the vertical gap reaches the threshold.
        if current_y is not None and abs(y - current_y) >= line_threshold:
            text_lines.append(join_row(current_line))
            current_line = []

        current_line.append(line)
        current_y = y

    # Add last line
    if current_line:
        text_lines.append(join_row(current_line))

    return '\n'.join(text_lines)

Best Practices

  • Preprocess Images: Improve quality before OCR
  • Choose Correct Language: Specify language for better accuracy
  • Handle Multi-column: Process columns separately
  • Filter Low Confidence: Skip results below threshold
  • Batch Processing: Process multiple images efficiently

Common Patterns

Image Preprocessing

from PIL import Image, ImageEnhance, ImageFilter

def preprocess_image(image_path, output_path='preprocessed.png'):
    """Preprocess image for better OCR.

    Applies grayscale conversion, a 2.0x contrast boost and a sharpen
    filter, then writes the result to disk.

    Args:
        image_path: Path of the image to read.
        output_path: Where to write the processed image. Defaults to
            ``'preprocessed.png'``; pass a distinct path per image when
            processing several files so they do not overwrite each other.

    Returns:
        The path the processed image was written to (``output_path``).
    """
    # The context manager closes the source file; convert() returns a new
    # in-memory image, so it stays usable after the file is closed.
    with Image.open(image_path) as source:
        # Convert to grayscale
        img = source.convert('L')

    # Enhance contrast
    img = ImageEnhance.Contrast(img).enhance(2.0)

    # Sharpen
    img = img.filter(ImageFilter.SHARPEN)

    # Save preprocessed
    img.save(output_path)

    return output_path

Batch OCR with Progress

from tqdm import tqdm

from concurrent.futures import ThreadPoolExecutor

def batch_ocr(image_paths, max_workers=4):
    """Run OCR over several images using a pool of worker threads.

    Args:
        image_paths: Iterable of image paths to process.
        max_workers: Size of the thread pool.

    Returns:
        Dict mapping each image path to the raw result of ``ocr.ocr`` for
        that path.
    """
    # NOTE(review): every worker calls the shared module-level `ocr` engine;
    # confirm it is safe to use from multiple threads.
    def run_one(path):
        return path, ocr.ocr(path)

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = [pool.submit(run_one, path) for path in image_paths]

        # Futures are awaited in submission order; the progress bar advances
        # as each one is collected.
        collected = dict(
            job.result() for job in tqdm(pending, desc="Processing OCR")
        )

    return collected

Examples

Example 1: Business Card Reader

from paddleocr import PaddleOCR

import re

def read_business_card(image_path):
    """Extract contact info from business card.

    Args:
        image_path: Path of the business card image.

    Returns:
        Dict with keys ``'name'``, ``'email'``, ``'phone'``, ``'company'``,
        ``'title'`` and ``'raw_text'``. Fields that could not be found are
        ``None``; ``'company'`` and ``'title'`` are never filled in by this
        function. ``'raw_text'`` is every detected line joined by newlines.
    """
    ocr = PaddleOCR(use_angle_cls=True, lang='en')
    result = ocr.ocr(image_path)

    # The per-image entry is None when no text was detected.
    detected = result[0] if result and result[0] else []

    # Extract all text
    all_text = [line[1][0] for line in detected]
    full_text = '\n'.join(all_text)

    # Parse contact info
    contact = {
        'name': None,
        'email': None,
        'phone': None,
        'company': None,
        'title': None,
        'raw_text': full_text,
    }

    # Email pattern
    email_match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', full_text)
    if email_match:
        contact['email'] = email_match.group()

    # Phone pattern. Searched line by line: \s also matches '\n', so running
    # it over the joined text could glue digits from adjacent lines (e.g. a
    # suite number and the phone below it) into one "phone". The leading
    # class also accepts '(' so "(415) 555-1234" keeps its opening paren.
    phone_pattern = re.compile(r'[\+\(\d][\d\s\-\(\)]{8,}')
    for text in all_text:
        phone_match = phone_pattern.search(text)
        if phone_match:
            contact['phone'] = phone_match.group().strip()
            break

    # Name is usually the largest/first text
    if all_text:
        contact['name'] = all_text[0]

    return contact

# Demo: read one card and print the main contact fields, one per line
card_info = read_business_card('business_card.jpg')

print(
    f"Name: {card_info['name']}",
    f"Email: {card_info['email']}",
    f"Phone: {card_info['phone']}",
    sep='\n',
)

Example 2: Receipt Scanner

from paddleocr import PaddleOCR

import re

def _parse_amount(text):
    """Return the first numeric amount in *text* as a float, or None."""
    # The amount must start with a digit. The previous pattern
    # ([\d,]+\.?\d*) could capture a lone ',' (as in "Total, incl. tax"),
    # which turned into float('') and raised ValueError.
    match = re.search(r'\$?(\d[\d,]*(?:\.\d+)?)', text)
    if match is None:
        return None
    return float(match.group(1).replace(',', ''))


def scan_receipt(image_path):
    """Extract items and total from receipt.

    Args:
        image_path: Path of the receipt image.

    Returns:
        Dict with ``'items'`` (list of ``{'name', 'price'}`` dicts) and
        ``'subtotal'``, ``'tax'``, ``'total'`` (floats, or ``None`` when not
        found). Lines are visited top to bottom, so a later matching line
        overwrites an earlier one.
    """
    ocr = PaddleOCR(use_angle_cls=True, lang='en')
    result = ocr.ocr(image_path)

    # The per-image entry is None when no text was detected.
    detected = result[0] if result and result[0] else []

    # Keep each line's text with the y coordinate of its first box point.
    lines = [{'text': line[1][0], 'y': line[0][0][1]} for line in detected]

    # Sort by vertical position
    lines.sort(key=lambda x: x['y'])

    receipt = {
        'items': [],
        'subtotal': None,
        'tax': None,
        'total': None,
    }

    for line in lines:
        text = line['text']
        lowered = text.lower()

        # Look for total ("subtotal" also contains "total", so split on 'sub')
        if 'total' in lowered:
            amount = _parse_amount(text)
            if amount is not None:
                key = 'subtotal' if 'sub' in lowered else 'total'
                receipt[key] = amount

        # Look for tax
        elif 'tax' in lowered:
            amount = _parse_amount(text)
            if amount is not None:
                receipt['tax'] = amount

        # Look for items (line ending with a price)
        else:
            item_match = re.search(r'(.+?)\s+\$?([\d,]+\.?\d+)$', text)
            if item_match:
                receipt['items'].append({
                    'name': item_match.group(1).strip(),
                    'price': float(item_match.group(2).replace(',', '')),
                })

    return receipt

# Demo: scan one receipt and report the item count and the total
receipt_data = scan_receipt('receipt.jpg')

summary = [
    f"Items: {len(receipt_data['items'])}",
    f"Total: ${receipt_data['total']}",
]
print('\n'.join(summary))

Example 3: Multi-language Document

from paddleocr import PaddleOCR

def ocr_multilingual(image_path, languages=('en', 'ch')):
    """OCR document with multiple languages.

    Runs one OCR pass per language code over the same image and merges the
    recognized strings.

    Args:
        image_path: Path of the image to read.
        languages: Iterable of language codes, one OCR pass each. The
            default is a tuple so no mutable default object is shared
            between calls.

    Returns:
        Dict mapping each recognized text string to
        ``{'confidence': ..., 'language': ...}`` for the pass that produced
        it with the highest confidence. On a tie, the language that comes
        first in ``languages`` is kept.
    """
    all_results = {}

    for lang in languages:
        ocr = PaddleOCR(use_angle_cls=True, lang=lang)
        result = ocr.ocr(image_path)

        # The per-image entry is None when this pass detected no text.
        detected = result[0] if result and result[0] else []

        all_results[lang] = [
            {'text': line[1][0], 'confidence': line[1][1]}
            for line in detected
        ]

    # Merge results, keeping highest confidence
    merged = {}
    for lang, texts in all_results.items():
        for item in texts:
            text = item['text']
            conf = item['confidence']

            if text not in merged or merged[text]['confidence'] < conf:
                merged[text] = {'confidence': conf, 'language': lang}

    return merged

# Demo: print every merged string with its winning language and confidence
result = ocr_multilingual('bilingual_document.png')

for text in result:
    info = result[text]
    print(f"[{info['language']}] {text} ({info['confidence']:.2f})")

Limitations

  • Handwritten text accuracy varies
  • Very small text may not be detected
  • Complex backgrounds reduce accuracy
  • Rotated text needs angle classification (`use_angle_cls=True`)
  • GPU recommended for best performance

Installation

# CPU version
pip install paddlepaddle paddleocr

# GPU version (CUDA 11.x)
pip install paddlepaddle-gpu paddleocr

# Additional dependencies used by the examples above
# (pdf2image also needs the Poppler utilities installed on the system)
pip install pdf2image Pillow requests tqdm

Resources

BrowserAct

Let your agent run on any real-world website

Bypass CAPTCHA & anti-bot for free. Start local, scale to cloud.

Explore BrowserAct Skills →

Stop writing automation & scrapers

Install the CLI. Run your first Skill in 30 seconds. Scale when you're ready.

Start free
free · no credit card