Loading...
Loading...
Use when you need legal PDF to markdown extraction plus clause chunking and embedding prep; pair with addon-rag-ingestion-pipeline and architect-python-uv-batch.
npx skill4agent add ajrlewis/ai-skills addon-docling-legal-chunk-embed
architect-python-uv-batch
architect-python-uv-fastapi-sqlalchemy
addon-rag-ingestion-pipeline
LEGAL_SOURCE_DIR: data/inbox/legal
CLAUSE_MAX_CHARS: 1400
CLAUSE_OVERLAP_CHARS: 120
EMBED_PROVIDER: sentence-transformers | openai
OUTPUT_MODE: markdown+json | json-only
uv add docling orjson
uv add sentence-transformers
uv add openai
src/{{MODULE_NAME}}/rag/legal/docling_extract.py
src/{{MODULE_NAME}}/rag/legal/clause_chunk.py
src/{{MODULE_NAME}}/rag/legal/embed_index.py
src/{{MODULE_NAME}}/rag/legal/types.py
uv run {{PROJECT_NAME}} legal-extract --source data/inbox/legal --out data/processed/legal
uv run {{PROJECT_NAME}} legal-index --source data/processed/legal --out data/index/legal-index.json
Article
Section
source_path
page
section
clause_id
src/{{MODULE_NAME}}/rag/legal/types.py
from pydantic import BaseModel
class LegalClause(BaseModel):
    """Pydantic model for a single clause record taken from a legal document.

    Only ``clause_id``, ``source_path`` and ``content`` are required; the
    remaining fields have defaults.
    """

    # Required identifier string for the clause.
    clause_id: str
    # Required string naming where the clause came from
    # (presumably the source file path -- confirm against the extractor).
    source_path: str
    # Optional section label; None when not provided.
    section: str | None = None
    # Optional page number; None when not provided.
    page: int | None = None
    # Required clause text.
    content: str
    # Free-form string-to-string annotations. Pydantic copies mutable field
    # defaults per instance, so the literal {} is not shared between models.
    metadata: dict[str, str] = {}


# src/{{MODULE_NAME}}/rag/legal/clause_chunk.py
import re
# Matches a block that opens a new article/section/clause. Markdown heading
# markers ("## Section 2") and emphasis ("**Article 1**") are tolerated in
# front of the keyword so extracted markdown headings count as boundaries.
SECTION_RE = re.compile(
    r"^(?:#{1,6}\s*)?[*_]{0,3}(article|section|clause)\s+[\w.-]+",
    re.IGNORECASE,
)

# One or more blank lines; a "blank" line may still hold spaces or tabs.
_BLANK_LINE_RE = re.compile(r"\n(?:[ \t]*\n)+")


def _split_oversized(block: str, max_chars: int) -> list[str]:
    """Cut *block* into pieces of at most *max_chars*, preferring whitespace.

    Expects a stripped *block* and ``max_chars >= 1``.
    """
    pieces: list[str] = []
    rest = block
    while len(rest) > max_chars:
        # Look one character past the limit so a break that falls exactly at
        # the limit is still usable.
        window = rest[: max_chars + 1]
        cut = max(window.rfind(" "), window.rfind("\n"), window.rfind("\t"))
        if cut <= 0:
            # No whitespace inside the window: fall back to a hard cut.
            cut = max_chars
        pieces.append(rest[:cut].rstrip())
        rest = rest[cut:].lstrip()
    if rest:
        pieces.append(rest)
    return pieces


def split_legal_clauses(markdown_text: str, max_chars: int = 1400) -> list[str]:
    """Split markdown text into clause-sized chunks.

    The text is broken into paragraphs on blank lines. Paragraphs are packed
    together (joined by a blank line) until adding the next one would exceed
    *max_chars*. A paragraph matching ``SECTION_RE`` always starts a new chunk.

    Args:
        markdown_text: Markdown to split. CRLF line endings are normalized to
            LF before splitting.
        max_chars: Soft upper bound on chunk length. When positive, a single
            paragraph longer than this is cut into pieces no longer than
            *max_chars*, at whitespace where possible. When zero or negative,
            every paragraph becomes its own chunk and nothing is cut.

    Returns:
        The chunks in document order; an empty list for blank input.
    """
    normalized = markdown_text.replace("\r\n", "\n")
    blocks = [b.strip() for b in _BLANK_LINE_RE.split(normalized) if b.strip()]

    clauses: list[str] = []
    buf = ""
    for block in blocks:
        starts_section = SECTION_RE.match(block) is not None
        oversized = 0 < max_chars < len(block)
        # +2 accounts for the blank-line separator inserted when joining.
        fits = not buf or len(buf) + len(block) + 2 <= max_chars

        if buf and (starts_section or oversized or not fits):
            clauses.append(buf)
            buf = ""

        if oversized:
            # Pieces of one paragraph are never re-joined with each other;
            # only the tail stays open so following paragraphs can attach.
            *full, buf = _split_oversized(block, max_chars)
            clauses.extend(full)
        elif buf:
            buf = f"{buf}\n\n{block}"
        else:
            buf = block

    if buf:
        clauses.append(buf)
    return clauses


# uv run {{PROJECT_NAME}} legal-extract --source data/inbox/legal --out data/processed/legal
uv run {{PROJECT_NAME}} legal-index --source data/processed/legal --out data/index/legal-index.json
uv run pytest -q
offline-smoke
test -f src/{{MODULE_NAME}}/rag/legal/docling_extract.py
test -f src/{{MODULE_NAME}}/rag/legal/clause_chunk.py
test -f src/{{MODULE_NAME}}/rag/legal/embed_index.py