Patterns for ingesting knowledge into vector databases and RAG systems

```
npx skill4agent add mindmorass/reflex knowledge-ingestion-patterns
```

Systematic approaches for ingesting different content types into RAG with optimal chunking, metadata, and retrieval quality.
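Every pattern below returns the same shape: a list of dicts pairing the text to embed with a metadata dict used for filtering at query time. A minimal illustration (the values are invented):

```python
# The shared chunk shape every ingestion function below returns.
# Values here are invented for illustration only.
chunk = {
    "content": "The text that gets embedded and retrieved.",
    "metadata": {          # filterable fields; vary by content type
        "type": "pdf",     # pdf | pdf_table | webpage | research
        "source": "docs/manual.pdf",
        "page": 3,
    },
}
```

PDF documents are the first pattern: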
```python
import fitz  # PyMuPDF
from typing import List, Dict


def chunk_pdf(pdf_path: str, chunk_size: int = 500) -> List[Dict]:
    """Extract and chunk PDF content with page awareness."""
    doc = fitz.open(pdf_path)
    chunks = []

    for page_num, page in enumerate(doc, 1):
        text = page.get_text()

        # Skip empty pages
        if not text.strip():
            continue

        # Split into paragraphs (PyMuPDF emits one text line per '\n')
        paragraphs = text.split('\n')

        current_chunk = ""
        for para in paragraphs:
            para = para.strip()
            if not para:
                continue

            if len(current_chunk) + len(para) < chunk_size:
                current_chunk += " " + para
            else:
                if current_chunk:
                    chunks.append({
                        "content": current_chunk.strip(),
                        "metadata": {
                            "type": "pdf",
                            "source": pdf_path,
                            "page": page_num,
                            "total_pages": len(doc)
                        }
                    })
                current_chunk = para

        # Don't forget the last chunk of the page
        if current_chunk:
            chunks.append({
                "content": current_chunk.strip(),
                "metadata": {
                    "type": "pdf",
                    "source": pdf_path,
                    "page": page_num,
                    "total_pages": len(doc)
                }
            })

    return chunks
```
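A quick usage sketch for chunk_pdf (the file path is hypothetical):

```python
# "docs/manual.pdf" is a placeholder; any local PDF works.
chunks = chunk_pdf("docs/manual.pdf", chunk_size=500)
print(len(chunks), "chunks from", chunks[0]["metadata"]["total_pages"], "pages")
```

Tables need separate handling, since paragraph-style chunking loses row and column structure: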
```python
def extract_pdf_tables(pdf_path: str) -> List[Dict]:
    """Extract tables from PDF as separate chunks."""
    import pdfplumber  # lazy import: only needed when extracting tables

    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            for table_num, table in enumerate(page.extract_tables(), 1):
                # Convert table to markdown format
                if table:
                    headers = table[0]
                    rows = table[1:]

                    md_table = "| " + " | ".join(str(h) for h in headers) + " |\n"
                    md_table += "| " + " | ".join("---" for _ in headers) + " |\n"
                    for row in rows:
                        md_table += "| " + " | ".join(str(c) for c in row) + " |\n"

                    tables.append({
                        "content": md_table,
                        "metadata": {
                            "type": "pdf_table",
                            "source": pdf_path,
                            "page": page_num,
                            "table_number": table_num
                        }
                    })

    return tables
```

Metadata fields:

- type: pdf | pdf_table
- source: file path
- page: page number
- total_pages: document length
- table_number: (for tables) which table on page
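These fields are what make filtered retrieval work. A minimal sketch, assuming the chunks have already been loaded into a chromadb collection (`collection` here is hypothetical):

```python
# Assumes a chromadb collection already populated with the chunks above.
results = collection.query(
    query_texts=["quarterly revenue by region"],
    n_results=5,
    where={"type": "pdf_table"},  # restrict retrieval to table chunks
)
```

Web content follows the same pattern with its own metadata: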
```python
import httpx
from bs4 import BeautifulSoup
from typing import List, Dict
from urllib.parse import urljoin, urlparse


def chunk_webpage(url: str) -> List[Dict]:
    """Fetch and chunk a webpage."""
    response = httpx.get(url, follow_redirects=True)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Remove noise
    for tag in soup.find_all(['nav', 'footer', 'aside', 'script', 'style']):
        tag.decompose()

    chunks = []

    # Find main content
    main = soup.find('main') or soup.find('article') or soup.find('body')

    # Chunk by sections
    for section in main.find_all(['section', 'div'], class_=lambda x: x and 'content' in str(x).lower()):
        text = section.get_text(separator=' ', strip=True)
        if len(text) > 100:  # Skip tiny sections
            chunks.append({
                "content": text,
                "metadata": {
                    "type": "webpage",
                    "source": url,
                    "domain": urlparse(url).netloc,
                    "title": soup.title.string if soup.title else ""
                }
            })

    # If no sections found, chunk the whole page
    if not chunks:
        text = main.get_text(separator=' ', strip=True)
        # Split into ~500-word chunks with a 50-word overlap
        words = text.split()
        for i in range(0, len(words), 450):
            chunk_text = ' '.join(words[i:i + 500])
            chunks.append({
                "content": chunk_text,
                "metadata": {
                    "type": "webpage",
                    "source": url,
                    "domain": urlparse(url).netloc,
                    "title": soup.title.string if soup.title else ""
                }
            })

    return chunks
```
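Usage mirrors the PDF path (the URL is illustrative):

```python
chunks = chunk_webpage("https://example.com/docs/getting-started")
for c in chunks[:3]:
    print(c["metadata"]["title"], "|", c["content"][:80])
```

For whole sites, the crawler below walks same-domain links and reuses chunk_webpage per page: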
```python
async def crawl_site(start_url: str, max_pages: int = 50) -> List[Dict]:
    """Crawl a site and chunk all pages."""
    base_domain = urlparse(start_url).netloc
    visited = set()
    to_visit = [start_url]
    all_chunks = []

    async with httpx.AsyncClient() as client:
        while to_visit and len(visited) < max_pages:
            url = to_visit.pop(0)
            if url in visited:
                continue
            try:
                response = await client.get(url, follow_redirects=True)
                visited.add(url)

                # Chunk this page (note: chunk_webpage re-fetches it
                # synchronously; acceptable for small crawls)
                all_chunks.extend(chunk_webpage(url))

                # Find same-domain links to follow
                soup = BeautifulSoup(response.text, 'html.parser')
                for link in soup.find_all('a', href=True):
                    href = urljoin(url, link['href'])
                    if urlparse(href).netloc == base_domain and href not in visited:
                        to_visit.append(href)
            except Exception as e:
                print(f"Failed to fetch {url}: {e}")

    return all_chunks
```

Metadata fields:

- type: webpage
- source: full URL
- domain: domain name
- title: page title
- crawl_depth: (for crawls) how many links from start
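The schema lists crawl_depth, but crawl_site above does not record it. One way to add it is to queue (url, depth) pairs instead of bare URLs; a sketch (the function name is mine):

```python
async def crawl_site_with_depth(start_url: str, max_pages: int = 50) -> List[Dict]:
    """Variant of crawl_site that stamps each chunk with its crawl depth."""
    base_domain = urlparse(start_url).netloc
    visited = set()
    to_visit = [(start_url, 0)]  # queue (url, depth) pairs
    all_chunks = []
    async with httpx.AsyncClient() as client:
        while to_visit and len(visited) < max_pages:
            url, depth = to_visit.pop(0)
            if url in visited:
                continue
            try:
                response = await client.get(url, follow_redirects=True)
                visited.add(url)
                for chunk in chunk_webpage(url):
                    chunk["metadata"]["crawl_depth"] = depth  # links from start_url
                    all_chunks.append(chunk)
                soup = BeautifulSoup(response.text, 'html.parser')
                for link in soup.find_all('a', href=True):
                    href = urljoin(url, link['href'])
                    if urlparse(href).netloc == base_domain and href not in visited:
                        to_visit.append((href, depth + 1))
            except Exception as e:
                print(f"Failed to fetch {url}: {e}")
    return all_chunks
```

The last pattern handles research notes, which are organized by topic rather than by page or URL: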
```python
from typing import List, Dict, Optional
from datetime import datetime


def chunk_research_notes(content: str, topic: Optional[str] = None) -> List[Dict]:
    """Chunk research notes with topic awareness."""
    # Split by double newlines (paragraphs)
    paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]

    chunks = []
    current_topic = topic or "general"

    for para in paragraphs:
        # Check if this is a topic header
        if para.startswith('#') or (len(para) < 50 and para.endswith(':')):
            current_topic = para.strip('#: ')
            continue

        chunks.append({
            "content": para,
            "metadata": {
                "type": "research",
                "topic": current_topic,
                "ingested_at": datetime.now().isoformat(),
                "word_count": len(para.split())
            }
        })

    return chunks
```
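A small worked example (the notes are invented):

```python
notes = """# Embeddings

Dense vectors capture semantic similarity between texts.

# Chunking

Smaller chunks retrieve more precisely but lose context."""

for c in chunk_research_notes(notes):
    print(c["metadata"]["topic"], c["metadata"]["word_count"])
# Embeddings 7
# Chunking 8
```

When notes summarize an external source, attribution can be layered on top: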
```python
def chunk_with_source_attribution(
    content: str,
    source_url: Optional[str] = None,
    source_title: Optional[str] = None,
    researcher: Optional[str] = None
) -> List[Dict]:
    """Chunk research with full source attribution."""
    chunks = chunk_research_notes(content)

    for chunk in chunks:
        chunk["metadata"].update({
            "source_url": source_url,
            "source_title": source_title,
            "researcher": researcher
        })

    return chunks
```

Metadata fields:

- type: research
- topic: extracted or assigned topic
- source_url: where the info came from
- source_title: title of source
- researcher: who did the research
- ingested_at: timestamp
- word_count: chunk size
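To close the loop, a minimal end-to-end sketch of loading any of these chunk lists into a vector store, assuming chromadb (the collection name is arbitrary and the ids are just positional):

```python
import chromadb

client = chromadb.Client()
collection = client.get_or_create_collection("knowledge")

# Any of the patterns above produces a compatible chunk list.
chunks = chunk_pdf("docs/manual.pdf")  # or chunk_webpage / chunk_research_notes
collection.add(
    ids=[f"chunk-{i}" for i in range(len(chunks))],
    documents=[c["content"] for c in chunks],
    metadatas=[c["metadata"] for c in chunks],
)
```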