Loading...
Loading...
Use when adding multi-format RAG ingestion, chunking, embedding, and retrieval pipelines; pair with architect-python-uv-batch or architect-python-uv-fastapi-sqlalchemy.
npx skill4agent add ajrlewis/ai-skills addon-rag-ingestion-pipeline

Related skills: architect-python-uv-batch, architect-python-uv-fastapi-sqlalchemy

Configuration defaults:
- SOURCE_FORMATS: pdf, markdown, txt, html, csv
- EMBED_PROVIDER: openai | sentence-transformers
- VECTOR_STORE: pgvector | chroma
- CHUNK_SIZE: 1000
- CHUNK_OVERLAP: 150
- TOP_K: 5

Install:
- uv add pypdf markdown-it-py beautifulsoup4 pandas langchain-text-splitters
- If EMBED_PROVIDER=openai: uv add openai
- If EMBED_PROVIDER=sentence-transformers: uv add sentence-transformers
- If VECTOR_STORE=chroma: uv add chromadb

Module layout: src/{{MODULE_NAME}}/rag/
loaders/pdf_loader.py
loaders/markdown_loader.py
loaders/text_loader.py
loaders/html_loader.py
loaders/csv_loader.py
normalize.py
chunking.py
embeddings.py
indexer.py
retriever.py

Record fields: document_id, source_path, source_type, content, metadata

Usage:
  uv run {{PROJECT_NAME}} rag-ingest --source ./data/inbox --formats pdf,markdown,txt
  uv run {{PROJECT_NAME}} rag-query --q "question" --top-k 5

(Fragments recovered from garbled extraction — surrounding prose lost; verify against
the original: rag-query, rag-ingest, page_number, utf-8, latin-1)

normalize.py

import re
import unicodedata
def normalize_text(raw: str) -> str:
    """Normalize raw extracted text before chunking.

    Applies Unicode NFKC normalization, unifies line endings to LF,
    collapses runs of horizontal whitespace into single spaces, caps
    consecutive blank lines at one, and strips outer whitespace.
    """
    # NFKC folds compatibility forms (ligatures, full-width characters)
    # into canonical equivalents so downstream matching is consistent.
    text = unicodedata.normalize("NFKC", raw)
    # Unify CRLF *and* bare CR line endings; handling only "\r\n" would
    # leave stray "\r" from legacy-Mac or mis-split sources in the output.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    # Collapse runs of spaces/tabs to one space; newlines are untouched here.
    text = re.sub(r"[ \t]+", " ", text)
    # Allow at most one blank line (two consecutive newlines) between blocks.
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


# chunking.py (next snippet in the document; third-party import shown there)
# from langchain_text_splitters import RecursiveCharacterTextSplitter
def chunk_text(
    text: str,
    chunk_size: int = 1000,
    overlap: int = 150,
    separators: list[str] | None = None,
) -> list[str]:
    """Split normalized document text into overlapping chunks.

    Args:
        text: The (already normalized) document text.
        chunk_size: Target maximum characters per chunk.
        overlap: Characters shared between consecutive chunks.
        separators: Preferred split boundaries, tried in order. Defaults to
            paragraph, line, sentence, word, then character-level splits.

    Returns:
        Chunk strings in document order.
    """
    # Imported lazily so merely importing this module does not require the
    # optional third-party splitter dependency.
    from langchain_text_splitters import RecursiveCharacterTextSplitter

    if separators is None:
        # None sentinel instead of a mutable default argument.
        separators = ["\n\n", "\n", ". ", " ", ""]
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        separators=separators,
    )
    return splitter.split_text(text)


# Fragments recovered from garbled extraction (chunk metadata / smoke test):
# source_path
# uv run {{PROJECT_NAME}} rag-ingest --source ./data/inbox --formats pdf,markdown
uv run {{PROJECT_NAME}} rag-query --q "smoke test" --top-k 5
uv run pytest -q