Loading...
Loading...
OpenAI Privacy Filter — bidirectional token-classification model for PII detection and masking in text
npx skill4agent add aradotso/trending-skills openai-privacy-filter

Skill by ara.so — Daily 2026 Skills collection.
pip install -e .
# or from a cloned repo:
git clone https://github.com/openai/privacy-filter
cd privacy-filter
pip install -e .

This installs the `opf` CLI. By default the model checkpoint is loaded from `~/.opf/privacy_filter`; set the `OPF_CHECKPOINT` environment variable to point at a different checkpoint:

export OPF_CHECKPOINT=/path/to/local/checkpoint_dir

| Label | Description |
|---|---|
| private_banking | Bank/card/account numbers |
| private_address | Physical addresses |
| private_email | Email addresses |
| private_person | Personal names |
| private_phone | Phone numbers |
| private_url | Personal URLs |
| private_date | Dates of birth / personal dates |
| private_credential | Credentials, tokens, API keys |
# Redact inline text
opf "Alice was born on 1990-01-02 and her email is alice@example.com."
# Force CPU inference
opf --device cpu "Alice was born on 1990-01-02."
# Use a specific checkpoint
opf --checkpoint /path/to/checkpoint_dir "Alice Johnson, SSN 123-45-6789"
# Redact an entire file
opf -f /path/to/document.txt
# Pipe input
cat document.txt | grep "sensitive" | opf
# Interactive mode (no input provided)
opf

# Evaluate on a labeled JSONL dataset
opf eval examples/data/sample_eval_five_examples.jsonl
# See all eval options
opf eval --help

# Finetune on your labeled dataset
opf train /path/to/train.jsonl --output-dir /path/to/finetuned_checkpoint
# See all training options
opf train --help

from opf import PrivacyFilter
# Load with default checkpoint (~/.opf/privacy_filter or OPF_CHECKPOINT)
pf = PrivacyFilter()
# Or specify a checkpoint explicitly
pf = PrivacyFilter(checkpoint="/path/to/checkpoint_dir")
# Redact a single string
result = pf.redact("Alice Johnson called from +1-800-555-0199.")
print(result.redacted_text)
# "██████████████ called from ██████████████."
# Access detected spans
for span in result.spans:
    print(span.label, span.text, span.start, span.end)

from opf import PrivacyFilter
pf = PrivacyFilter(device="cuda") # or "cpu"
texts = [
"Contact Bob Smith at bob@example.com",
"Her SSN is 123-45-6789 and DOB is 1985-03-15",
"API key: sk-abc123xyz789",
]
results = pf.redact_batch(texts)
for r in results:
    print(r.redacted_text)
    print(r.spans)

from opf import PrivacyFilter
# High recall (broader masking, more false positives)
pf_recall = PrivacyFilter(operating_point="high_recall")
# High precision (stricter masking, fewer false positives)
pf_precision = PrivacyFilter(operating_point="high_precision")
# Default balanced
pf_default = PrivacyFilter()

{"text": "Alice was born on 1990-01-02.", "spans": [{"start": 0, "end": 5, "label": "private_person"}, {"start": 18, "end": 28, "label": "private_date"}]}
{"text": "Email bob@corp.com for details.", "spans": [{"start": 6, "end": 18, "label": "private_email"}]}

{
"redacted_text": "██████ was born on ██████████.",
"spans": [
{
"label": "private_person",
"text": "Alice",
"start": 0,
"end": 5,
"score": 0.987
},
{
"label": "private_date",
"text": "1990-01-02",
"start": 18,
"end": 28,
"score": 0.973
}
]
}

See OUTPUT_SCHEMAS.md for the full output schema.

# Prepare labeled JSONL (see data format above)
# Run finetuning
opf train train.jsonl \
--output-dir ./my_finetuned_model \
--eval-file eval.jsonl \
--epochs 3 \
--batch-size 8
# Use the finetuned model
opf --checkpoint ./my_finetuned_model "redact this text"

See FINETUNING.md and examples/scripts/finetuning/ for more details.

| Variable | Purpose |
|---|---|
| OPF_CHECKPOINT | Path to model checkpoint directory (overrides default `~/.opf/privacy_filter`) |
opf/
├── __main__.py # CLI entrypoint (redact, eval, train)
├── _api.py # Python-facing API
├── _cli/ # Argument parsing, terminal rendering
├── _core/ # Runtime loading, span conversion, decoding
├── _eval/ # Dataset loading, metrics, eval runners
├── _train/ # Finetuning argument parsing and runners
├── _model/ # Transformer impl, checkpoint config, weight loading
examples/
├── data/ # Sample eval/finetune JSONL fixtures
├── scripts/finetuning/ # Runnable finetuning demo scripts

from opf import PrivacyFilter
import json
pf = PrivacyFilter()
def sanitize_for_llm(raw_text: str) -> str:
    """Return *raw_text* with every detected PII span masked out.

    Handy as a preprocessing step before forwarding user data to an LLM.
    Relies on the module-level ``pf`` PrivacyFilter instance.
    """
    return pf.redact(raw_text).redacted_text

with open("raw_data.txt") as f:
    clean = sanitize_for_llm(f.read())
print(clean)

from opf import PrivacyFilter
pf = PrivacyFilter()
def audit_pii(text: str) -> list[dict]:
    """Detect PII in *text* and report each span as a plain dict.

    Returns one ``{"label", "text", "start", "end"}`` dict per detected
    span, suitable for JSON serialization or audit logging.
    """
    fields = ("label", "text", "start", "end")
    spans = pf.redact(text).spans
    return [{name: getattr(span, name) for name in fields} for span in spans]

findings = audit_pii("Bob Jones (DOB: 1978-06-15) owes $1,200.")
print(json.dumps(findings, indent=2))

from opf import PrivacyFilter
pf = PrivacyFilter()
def redact_only(text: str, labels: list[str], mask: str = "█") -> str:
    """Redact only spans whose label is in *labels*, leaving everything else intact.

    Args:
        text: Raw input text to scan for PII.
        labels: PII labels (e.g. ``"private_email"``) that should be masked;
            spans with any other label are left readable.
        mask: Single character used to overwrite each redacted character
            (defaults to the full block "█" used by ``redact``).

    Returns:
        *text* with every character of each matching span replaced by *mask*.
    """
    result = pf.redact(text)
    wanted = set(labels)  # hoisted once: O(1) membership test per span
    chars = list(text)
    for span in result.spans:
        if span.label in wanted:
            # Slice assignment overwrites the span in one step instead of
            # a per-character loop.
            chars[span.start:span.end] = mask * (span.end - span.start)
    return "".join(chars)

# Only redact emails and phones, keep names
output = redact_only(
    "Call Alice at 555-1234 or alice@example.com",
    labels=["private_phone", "private_email"],
)
print(output)
# "Call Alice at ████████ or █████████████████"

Key knobs: `OPF_CHECKPOINT`, `--device cpu`, `--batch-size 1`, `opf train`, `operating_point="high_recall"`.

See also: FINETUNING.md, OUTPUT_SCHEMAS.md, EVAL_AND_OUTPUT_MODES.md