doc-parser

>

INSTALLATION
npx skills add https://github.com/claude-office-skills/skills --skill doc-parser
Run in your project or agent environment. Adjust flags if your CLI version differs.

SKILL.md

Document Parser Skill

Overview

This skill enables advanced document parsing using docling - IBM's state-of-the-art document understanding library. Parse complex PDFs, Word documents, and images while preserving structure, extracting tables, figures, and handling multi-column layouts.

How to Use

  • Provide the document to parse
  • Specify what you want to extract (text, tables, figures, etc.)
  • I'll parse it and return structured data

Example prompts:

  • "Parse this PDF and extract all tables"
  • "Convert this academic paper to structured markdown"
  • "Extract figures and captions from this document"
  • "Parse this report preserving the document structure"

Domain Knowledge

docling Fundamentals

from docling.document_converter import DocumentConverter

# Initialize a converter with its default settings (no arguments passed)

converter = DocumentConverter()

# Convert the document; the call returns a result object, not the document itself

result = converter.convert("document.pdf")

# The parsed document is exposed on the result as `.document`

doc = result.document

print(doc.export_to_markdown())

Supported Formats

Format

Extension

Notes

PDF

.pdf

Native and scanned

Word

.docx

Full structure preserved

PowerPoint

.pptx

Slides as sections

Images

.png, .jpg

OCR + layout analysis

HTML

.html

Structure preserved

Basic Usage

from docling.document_converter import DocumentConverter

# Create a converter with default settings

converter = DocumentConverter()

# Convert a single document; the return value wraps the parsed document

result = converter.convert("report.pdf")

# Pull the parsed document off the conversion result

doc = result.document

# Export the same parsed document three ways: Markdown, plain text, and a
# JSON/dict-style structure

markdown = doc.export_to_markdown()

text = doc.export_to_text()

json_doc = doc.export_to_dict()

Advanced Configuration

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Configure the PDF pipeline: enable OCR, table-structure recovery and
# table cell matching.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True

# Pipeline options are attached per input format through `format_options`;
# DocumentConverter does not accept a `pdf_backend_options` keyword.
converter = DocumentConverter(
    allowed_formats=[InputFormat.PDF, InputFormat.DOCX],
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
    },
)

result = converter.convert("document.pdf")

Document Structure

# Document hierarchy
doc = result.document

# Access metadata
print(doc.name)
print(doc.origin)

# Iterate through content. iterate_items() yields (item, level) pairs rather
# than bare items, and each item reports its kind through `label`.
for item, _level in doc.iterate_items():
    print(f"Type: {item.label.value}")
    # Not every item carries text (tables and pictures do not), so fall back
    # to None instead of raising AttributeError.
    print(f"Text: {getattr(item, 'text', None)}")
    if item.label == "table":
        # `table_cells` is a flat list of cells; the row count lives in
        # `num_rows`.
        print(f"Rows: {item.data.num_rows}")

Extracting Tables

from docling.document_converter import DocumentConverter

import pandas as pd

def extract_tables(doc_path):
    """Extract every table in a document as a pandas DataFrame.

    Args:
        doc_path: Source document, passed straight to
            ``DocumentConverter.convert``.

    Returns:
        A list of dicts in reading order. Each dict has ``'page'`` (the page
        number of the table's first provenance entry, or ``None`` when the
        table has no provenance) and ``'dataframe'`` (the table contents).
    """
    doc = DocumentConverter().convert(doc_path).document

    # iterate_items() yields (item, level) tuples, and an item's kind is
    # exposed as `label`; treating the tuple as the item raises
    # AttributeError on the first element.
    return [
        {
            'page': item.prov[0].page_no if item.prov else None,
            'dataframe': item.export_to_dataframe(),
        }
        for item, _level in doc.iterate_items()
        if item.label == "table"
    ]

# Usage: print each extracted table with a 1-based index and its page number
tables = extract_tables("report.pdf")
for number, entry in enumerate(tables, start=1):
    print(f"Table {number} on page {entry['page']}:")
    print(entry['dataframe'])

Extracting Figures

def extract_figures(doc_path, output_dir):
    """Extract figures with their captions and save their images as PNG.

    Args:
        doc_path: Source document, passed to ``DocumentConverter.convert``.
        output_dir: Directory for the saved images; created if missing.

    Returns:
        A list of dicts, one per picture in reading order, with ``'caption'``
        (caption text, or ``None`` when there is none), ``'page'`` (page
        number of the first provenance entry, or ``None``) and, only when an
        image could be obtained, ``'path'`` of the saved file.
    """
    import os

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption

    # Picture images are only kept on the document when the PDF pipeline is
    # asked to generate them; a default converter yields no image data.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.generate_picture_images = True
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
        }
    )
    doc = converter.convert(doc_path).document

    os.makedirs(output_dir, exist_ok=True)

    figures = []
    # iterate_items() yields (item, level) tuples; the item kind is `label`.
    for item, _level in doc.iterate_items():
        if item.label != "picture":
            continue

        figure_info = {
            # Captions are stored as references; caption_text() resolves
            # them against the document and returns "" when there are none.
            'caption': item.caption_text(doc) or None,
            'page': item.prov[0].page_no if item.prov else None,
        }

        # get_image() returns None when no image is available for the item.
        image = item.get_image(doc)
        if image is not None:
            img_path = os.path.join(output_dir, f"figure_{len(figures)+1}.png")
            image.save(img_path)
            figure_info['path'] = img_path

        figures.append(figure_info)

    return figures

Handling Multi-column Layouts

from docling.document_converter import DocumentConverter

def parse_multicolumn(doc_path):
    """Parse a document with a multi-column layout into a flat item list.

    Args:
        doc_path: Source document, passed to ``DocumentConverter.convert``.

    Returns:
        A list of dicts in the order ``iterate_items()`` yields them. Each
        has ``'type'`` (the item's label as a string), ``'text'`` and
        ``'level'`` (``None`` when the item has no such attribute), plus
        ``'bbox'`` and ``'page'`` when the item carries provenance.
    """
    doc = DocumentConverter().convert(doc_path).document

    # docling automatically handles column detection
    # Text is returned in reading order
    structured_content = []

    # iterate_items() yields (item, depth) tuples; the item kind is `label`.
    for item, _depth in doc.iterate_items():
        content_item = {
            'type': item.label.value,
            'text': getattr(item, 'text', None),
            'level': getattr(item, 'level', None),
        }

        # Add bounding box and page from the first provenance entry, if any
        if item.prov:
            first_prov = item.prov[0]
            content_item['bbox'] = first_prov.bbox
            content_item['page'] = first_prov.page_no

        structured_content.append(content_item)

    return structured_content

Export Formats

from docling.document_converter import DocumentConverter

converter = DocumentConverter()
result = converter.convert("document.pdf")
doc = result.document

# Markdown export
markdown = doc.export_to_markdown()

# Write as UTF-8 explicitly: parsed documents routinely contain non-ASCII
# characters, and the platform default encoding may be unable to encode them.
with open("output.md", "w", encoding="utf-8") as f:
    f.write(markdown)

# Plain text
text = doc.export_to_text()

# JSON/dict format
json_doc = doc.export_to_dict()

# HTML format (if supported)
# html = doc.export_to_html()

Batch Processing

from docling.document_converter import DocumentConverter

from pathlib import Path

from concurrent.futures import ThreadPoolExecutor

def batch_parse(input_dir, output_dir, max_workers=4):
    """Convert every PDF and DOCX in a directory to Markdown in parallel.

    Args:
        input_dir: Directory searched (non-recursively) for ``*.pdf`` and
            ``*.docx`` files.
        output_dir: Directory receiving one ``<stem>.md`` per input file;
            created, including missing parents, if it does not exist.
        max_workers: Size of the thread pool.

    Returns:
        A list of dicts, one per input file in glob order, with ``'file'``
        and ``'status'`` (``'success'`` or ``'error'``); failed entries also
        carry ``'error'`` with the exception message.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    # parents=True so a nested output directory does not raise
    # FileNotFoundError.
    output_path.mkdir(parents=True, exist_ok=True)

    converter = DocumentConverter()

    def process_single(doc_path):
        # Per-file best effort: a failure is reported in the result list
        # instead of aborting the whole batch.
        try:
            result = converter.convert(str(doc_path))
            md = result.document.export_to_markdown()
            # NOTE: inputs sharing a stem (a.pdf / a.docx) map to the same
            # output file; the later write wins.
            out_file = output_path / f"{doc_path.stem}.md"
            # Explicit UTF-8: the platform default encoding may be unable to
            # encode non-ASCII document text.
            out_file.write_text(md, encoding='utf-8')
            return {'file': str(doc_path), 'status': 'success'}
        except Exception as e:
            return {'file': str(doc_path), 'status': 'error', 'error': str(e)}

    docs = [*input_path.glob('*.pdf'), *input_path.glob('*.docx')]

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(executor.map(process_single, docs))

Best Practices

  • Use Appropriate Pipeline: Configure for your document type
  • Handle Large Documents: Process in chunks if needed
  • Verify Table Extraction: Complex tables may need review
  • Check OCR Quality: Enable OCR for scanned documents
  • Cache Results: Store parsed documents for reuse

Common Patterns

Academic Paper Parser

def parse_academic_paper(pdf_path):
    """Parse an academic paper into title, abstract, sections and floats.

    Args:
        pdf_path: Source document, passed to ``DocumentConverter.convert``.

    Returns:
        A dict with ``'title'``, ``'abstract'`` (all abstract paragraphs
        joined by newlines, or ``None``), ``'sections'`` (dicts with
        ``'title'`` and ``'content'``), ``'references'`` (one string per
        entry), ``'tables'`` (dicts with ``'caption'`` and ``'data'``) and
        ``'figures'`` (dicts with ``'caption'`` and ``'page'``).
    """
    doc = DocumentConverter().convert(pdf_path).document

    paper = {
        'title': None,
        'abstract': None,
        'sections': [],
        'references': [],
        'tables': [],
        'figures': [],
    }

    # Labels that carry running text; reference entries may also be tagged
    # as list items or references.
    text_labels = ('text', 'paragraph')
    reference_labels = text_labels + ('list_item', 'reference')

    current_section = None

    # iterate_items() yields (item, level) tuples; the item kind is `label`.
    for item, _level in doc.iterate_items():
        label = item.label
        text = getattr(item, 'text', '') or ''

        if label == 'title':
            paper['title'] = text

        elif label == 'section_header':
            lowered = text.lower()
            if 'abstract' in lowered:
                current_section = 'abstract'
            elif 'reference' in lowered:
                current_section = 'references'
            else:
                paper['sections'].append({'title': text, 'content': ''})
                current_section = 'section'

        elif label == 'table':
            paper['tables'].append({
                'caption': item.caption_text(doc) or None,
                'data': item.export_to_dataframe(),
            })

        elif label == 'picture':
            paper['figures'].append({
                'caption': item.caption_text(doc) or None,
                'page': item.prov[0].page_no if item.prov else None,
            })

        elif current_section == 'references' and label in reference_labels:
            paper['references'].append(text)

        elif label in text_labels:
            if current_section == 'abstract':
                # Accumulate so a multi-paragraph abstract is not reduced to
                # its last paragraph.
                if paper['abstract'] is None:
                    paper['abstract'] = text
                else:
                    paper['abstract'] += '\n' + text
            elif current_section == 'section' and paper['sections']:
                paper['sections'][-1]['content'] += text + '\n'

    return paper

Report to Structured Data

def parse_business_report(doc_path):
    """Return an empty business-report skeleton for the given document.

    The document is converted and its items are walked, but no field is
    filled in yet: the loop body is a placeholder for report-specific logic.
    """
    doc = DocumentConverter().convert(doc_path).document

    metadata = {'title': None, 'date': None, 'author': None}
    report = {
        'metadata': metadata,
        'executive_summary': None,
        'sections': [],
        'key_metrics': [],
        'recommendations': [],
    }

    # Parse document structure
    for _ in doc.iterate_items():
        # Implement parsing logic based on document structure
        continue

    return report

Examples

Example 1: Parse Financial Report

from docling.document_converter import DocumentConverter

def parse_financial_report(pdf_path):
    """Extract structured data from a financial report.

    Tables are classified by keyword search over their rendered text; when
    several tables match the same category, the last one wins.

    Args:
        pdf_path: Source document, passed to ``DocumentConverter.convert``.

    Returns:
        A dict with ``'income_statement'``, ``'balance_sheet'`` and
        ``'cash_flow'`` (a DataFrame or ``None``), ``'notes'`` (left empty),
        ``'other_tables'`` (DataFrames matching no category) and
        ``'markdown'`` (the whole document as Markdown).
    """
    doc = DocumentConverter().convert(pdf_path).document

    financial_data = {
        'income_statement': None,
        'balance_sheet': None,
        'cash_flow': None,
        'notes': [],
        # Unclassified tables are kept instead of being silently dropped.
        'other_tables': [],
    }

    # iterate_items() yields (item, level) tuples; the item kind is `label`.
    for item, _level in doc.iterate_items():
        if item.label != 'table':
            continue

        table_df = item.export_to_dataframe()
        # Render and lowercase once rather than once per keyword test.
        rendered = str(table_df).lower()

        # Identify table type
        if 'revenue' in rendered or 'income' in rendered:
            financial_data['income_statement'] = table_df
        elif 'asset' in rendered or 'liabilities' in rendered:
            financial_data['balance_sheet'] = table_df
        elif 'cash' in rendered:
            financial_data['cash_flow'] = table_df
        else:
            financial_data['other_tables'].append(table_df)

    # Extract markdown for notes
    financial_data['markdown'] = doc.export_to_markdown()

    return financial_data

# Usage: parse a report and print the table classified as the income
# statement (the entry stays None when no table matched).

report = parse_financial_report('annual_report.pdf')

print("Income Statement:")

print(report['income_statement'])

Example 2: Technical Documentation Parser

from docling.document_converter import DocumentConverter

def parse_technical_docs(doc_path):
    """Parse technical documentation into sections, code blocks and diagrams.

    Args:
        doc_path: Source document, passed to ``DocumentConverter.convert``.

    Returns:
        A dict with ``'title'``, ``'version'`` (never set here),
        ``'sections'`` (dicts with ``'title'``, ``'level'`` and a
        ``'content'`` list of code entries), ``'code_blocks'`` (all code
        texts in order) and ``'diagrams'`` (dicts with ``'page'`` and
        ``'caption'``).
    """
    doc = DocumentConverter().convert(doc_path).document

    documentation = {
        'title': None,
        'version': None,
        'sections': [],
        'code_blocks': [],
        'diagrams': [],
    }

    current_section = None

    # iterate_items() yields (item, depth) tuples; the item kind is `label`.
    for item, _depth in doc.iterate_items():
        label = item.label

        if label == 'title':
            documentation['title'] = item.text

        elif label == 'section_header':
            current_section = {
                'title': item.text,
                'level': getattr(item, 'level', 1),
                'content': [],
            }
            documentation['sections'].append(current_section)

        elif label == 'code':
            # Code seen before the first heading is still recorded globally.
            if current_section is not None:
                current_section['content'].append({
                    'type': 'code',
                    'content': item.text,
                })
            documentation['code_blocks'].append(item.text)

        elif label == 'picture':
            documentation['diagrams'].append({
                'page': item.prov[0].page_no if item.prov else None,
                # Captions are references resolved against the document;
                # caption_text() returns "" when there are none.
                'caption': item.caption_text(doc) or None,
            })

    return documentation

# Usage: parse a document and report its title and how many sections were found.

docs = parse_technical_docs('api_documentation.pdf')

print(f"Title: {docs['title']}")

print(f"Sections: {len(docs['sections'])}")

Example 3: Contract Analysis

from docling.document_converter import DocumentConverter

def analyze_contract(pdf_path):
    """Parse a contract for dates, monetary amounts and headed clauses.

    Args:
        pdf_path: Source document, passed to ``DocumentConverter.convert``.

    Returns:
        A dict with ``'parties'`` (not populated here), ``'clauses'`` (dicts
        with ``'title'`` and newline-joined ``'content'``), ``'dates'`` and
        ``'amounts'`` (regex matches over the plain text, in order of
        appearance) and ``'full_text'`` (the plain-text export).
    """
    import re

    doc = DocumentConverter().convert(pdf_path).document
    full_text = doc.export_to_text()

    # Numeric dates (1/2/2024, 01-02-24) or "Month D, YYYY" with a full or
    # abbreviated month name.
    date_pattern = r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b'
    # "$1,234.56"-style amounts, or a number followed by USD / dollars.
    amount_pattern = r'\$[\d,]+(?:\.\d{2})?|\b\d+(?:,\d{3})*(?:\.\d{2})?\s*(?:USD|dollars)\b'

    contract = {
        'parties': [],
        'clauses': [],
        'dates': re.findall(date_pattern, full_text, re.IGNORECASE),
        'amounts': re.findall(amount_pattern, full_text, re.IGNORECASE),
        'full_text': full_text,
    }

    # Parse sections as clauses. iterate_items() yields (item, level)
    # tuples; the item kind is `label`.
    for item, _level in doc.iterate_items():
        if item.label == 'section_header':
            contract['clauses'].append({'title': item.text, 'content': ''})
        elif item.label in ('text', 'paragraph') and contract['clauses']:
            # Text before the first heading belongs to no clause and is skipped.
            contract['clauses'][-1]['content'] += item.text + '\n'

    return contract

# Usage: analyze a contract and print the dates and amounts the regexes matched.

contract_data = analyze_contract('agreement.pdf')

print(f"Key dates: {contract_data['dates']}")

print(f"Amounts: {contract_data['amounts']}")

Limitations

  • Very large documents may require chunking
  • Handwritten content needs OCR preprocessing
  • Complex nested tables may need manual review
  • Some PDF types (encrypted) not supported
  • GPU recommended for best performance

Installation

pip install docling

# For full functionality

pip install docling[all]

# For OCR support

pip install docling[ocr]

Resources

BrowserAct

Let your agent run on any real-world website

Bypass CAPTCHA & anti-bot for free. Start local, scale to cloud.

Explore BrowserAct Skills →

Stop writing automation&scrapers

Install the CLI. Run your first Skill in 30 seconds. Scale when you're ready.

Start free
free · no credit card