Loading...
Loading...
This skill provides semantic search capabilities using embedding-based similarity matching for code and text. It enables meaning-based search beyond keyword matching, with optional document parsing (PDF, DOCX, PPTX) support.
npx skill4agent add massgen/massgen semtools
execute_command
search
workspace
parse
search
# Basic semantic search
search "authentication logic" src/
# Search with more context (5 lines before/after)
search "error handling" --n-lines 5 src/
# Get more results (default: 3)
search "database queries" --top-k 10 src/
# Control similarity threshold (0.0-1.0, lower = stricter, higher = more lenient)
search "API endpoints" --max-distance 0.4 src/
--n-lines N
--top-k K
--max-distance D
-i
Match 1 (similarity: 0.12)
File: src/auth/handlers.py
Lines: 42-47
----
def authenticate_user(username: str, password: str) -> Optional[User]:
"""Authenticate user credentials against database."""
user = get_user_by_username(username)
if user and verify_password(password, user.password_hash):
return user
return None
----
Match 2 (similarity: 0.18)
File: src/middleware/auth.py
...
workspace
# Create/activate workspace
workspace use my-project
# Set workspace via environment variable
export SEMTOOLS_WORKSPACE=my-project
# Index files in workspace (workspace auto-detected from env var)
search "query" src/
# Check workspace status
workspace status
# Clean up old workspaces
workspace prune
parse
# Parse PDFs to markdown
parse research_papers/*.pdf
# Parse Word documents
parse reports/*.docx
# Parse presentations
parse slides/*.pptx
# Parse and pipe to search
parse docs/*.pdf | xargs search "neural networks"
# Via environment variable
export LLAMA_CLOUD_API_KEY="llx-..."
# Via config file
cat > ~/.parse_config.json << EOF
{
"api_key": "llx-...",
"max_concurrent_requests": 10,
"timeout_seconds": 3600
}
EOF
# Step 1: Broad semantic search
search "rate limiting implementation" src/
# Step 2: Review results, refine query
search "throttle requests per user" src/ --top-k 10
# Step 3: Use ripgrep for exact follow-up
rg "RateLimiter" --type py src/
# Step 1: Extract key concepts from reference code
# [Read example_auth.py and identify key concepts]
# Step 2: Search for similar implementations
search "user authentication with JWT tokens" src/
# Step 3: Compare implementations
# [Review semantic matches to find similar approaches]
# Search code comments semantically
search "thread safety guarantees" src/ --n-lines 10
# Search markdown documentation
search "deployment best practices" docs/
# Combined search
search "performance optimization" --top-k 20
# Semantic search works across languages
search "connection pooling" src/
# May find:
# - Java: "ConnectionPool manager"
# - Python: "database connection reuse"
# - Go: "pool of persistent connections"
# All semantically related despite different terminology
# Step 1: Parse documents to markdown
parse research/*.pdf > papers.md
# Step 2: Search converted content
search "transformer architecture" papers.md
# Step 3: Combine with code search
search "attention mechanism implementation" src/

| You Know | Use First | Then Use | Why |
|---|---|---|---|
| Exact keywords | ripgrep | search | Fast exact match, then find similar |
| Concept only | search | ripgrep | Find relevant code, then search specifics |
| Function name | ripgrep | search | Find definition, then find similar usage |
| Code pattern | ast-grep | search | Find structure, then find similar logic |
| Approximate idea | search | ripgrep + ast-grep | Discover, then drill down |
# Layer 1: Semantic discovery (what's related?)
search "user session management" --top-k 10
# Layer 2: Exact text search (what's the implementation?)
rg "SessionManager|session_store" --type py
# Layer 3: Structural search (how is it used?)
sg --pattern 'session.$METHOD($$$)' --lang python
# Layer 4: Reference tracking (where is it called?)
# [Use serena skill for symbol-level tracking]
# GOOD: Broad semantic discovery first
search "authentication" src/ --top-k 10
# [Review results to learn terminology]
rg "authenticate|verify_credentials" --type py src/
# AVOID: Starting too narrow and missing variations
rg "authenticate" --type py # Misses "verify_credentials", "check_auth", etc.
--max-distance
# Too many irrelevant results? Decrease distance (more strict)
search "query" --max-distance 0.2
# Missing relevant results? Increase distance (more lenient)
search "query" --max-distance 0.5
# Default (0.3) works well for most cases
search "query"
# GOOD: Create workspace once, search many times
export SEMTOOLS_WORKSPACE=my-analysis
search "concept1" src/
search "concept2" src/
search "concept3" src/
# INEFFICIENT: Re-compute embeddings every time
search "concept1" src/
search "concept2" src/
# Find semantically similar code
search "retry logic" src/ --n-lines 2
# Get more context with ripgrep
rg -C 10 "retry" src/specific_file.py
# Or read the full file
cat src/specific_file.py
# GOOD: Conceptual queries
search "handling network timeouts"
search "user input validation"
search "concurrent data access"
# LESS EFFECTIVE: Exact keyword queries (use ripgrep instead)
search "timeout" # Use: rg "timeout"
search "validate" # Use: rg "validate"
# Strict matching (only close matches)
--max-distance 0.2
# Balanced matching (default, recommended)
--max-distance 0.3
# Lenient matching (exploratory search)
--max-distance 0.4
# Very lenient (may include false positives)
--max-distance 0.5
# 1. Use workspaces for repeated searches
export SEMTOOLS_WORKSPACE=my-project
# 2. Limit search scope to relevant directories
search "query" src/ --not tests/
# 3. Use --top-k to control result count
search "query" --top-k 5
# 4. Pipe to head for quick preview
search "query" | head -50
# Find and parse PDFs, then search
find docs/ -name "*.pdf" | xargs parse | xargs search "topic"
# Search and filter with grep
search "authentication" src/ | grep -i "jwt"
# Count matches
search "error handling" src/ | grep "Match" | wc -l
# Combine with other tools
search "API" src/ | xargs -I {} rg -l "REST" {}
# WRONG TOOL: Semantic search for exact function name
search "authenticate_user"
# RIGHT TOOL: Use ripgrep for exact matches
rg "authenticate_user" --type py
# WRONG TOOL: Semantic search for code structure
search "class with constructor"
# RIGHT TOOL: Use ast-grep for structure
sg --pattern 'class $NAME { constructor($$$) { $$$ } }'
# WRONG TOOL: Semantic search for all usages
search "MyClass usage"
# RIGHT TOOL: Use serena for precise references
serena find_referencing_symbols --name 'MyClass'
rg "concept" src/
search "query" --max-distance 0.5
search "user authentication"
search "verify user credentials"
search "login validation"
search "query" src/*.py # Target specific types
search "query" --max-distance 0.2
search "query" --top-k 3
search "query" src/specific_module/
# Vague
search "data"
# Specific
search "data validation with regex patterns"
parse
echo $LLAMA_CLOUD_API_KEY
file document.pdf # Verify file type
du -h document.pdf # Check size
cat ~/.parse_config.json
# Check workspace status
workspace status
# Prune corrupted workspaces
workspace prune
# Recreate workspace
rm -rf ~/.semtools/workspaces/my-workspace
export SEMTOOLS_WORKSPACE=my-workspace