Loading...
Loading...
Detect and mask PII (names, emails, phone numbers, SSNs, addresses) in text and CSV files. Supports multiple masking strategies, with an optional reversible tokenization mode.
npx skill4agent add dkyazzentwatwa/chatgpt-skills data-anonymizer

from scripts.data_anonymizer import DataAnonymizer
# Anonymize text
anonymizer = DataAnonymizer()
result = anonymizer.anonymize("Contact John Smith at john@email.com or 555-123-4567")
print(result)
# "Contact [NAME] at [EMAIL] or [PHONE]"
# Anonymize CSV
anonymizer.anonymize_csv("customers.csv", "customers_anon.csv")

anonymizer = DataAnonymizer(
strategy="mask", # mask, redact, hash, fake
reversible=False # Set to True to enable token mapping
)

# Basic anonymization
result = anonymizer.anonymize(text)
# With specific PII types
result = anonymizer.anonymize(text, pii_types=["email", "phone"])
# Get detected PII report
result, report = anonymizer.anonymize(text, return_report=True)

text = "Email john@test.com, call 555-1234"
# Mask (default) - replace with type labels
anonymizer.strategy = "mask"
# "Email [EMAIL], call [PHONE]"
# Redact - replace with asterisks
anonymizer.strategy = "redact"
# "Email ***************, call ********"
# Hash - replace with hash
anonymizer.strategy = "hash"
# "Email a1b2c3d4, call e5f6g7h8"
# Fake - replace with realistic fake data
anonymizer.strategy = "fake"
# "Email jane@example.org, call 555-9876"

# Auto-detect PII columns
anonymizer.anonymize_csv("input.csv", "output.csv")
# Specify columns
anonymizer.anonymize_csv(
"input.csv",
"output.csv",
columns=["name", "email", "phone"]
)
# Different strategies per column
anonymizer.anonymize_csv(
"input.csv",
"output.csv",
column_strategies={
"name": "fake",
"email": "hash",
"ssn": "redact"
}
)

anonymizer = DataAnonymizer(reversible=True)
# Anonymize with token mapping
result = anonymizer.anonymize("John Smith: john@test.com")
mapping = anonymizer.get_mapping()
# Save mapping securely
anonymizer.save_mapping("mapping.json", encrypt=True, password="secret")
# Later, de-anonymize
anonymizer.load_mapping("mapping.json", password="secret")
original = anonymizer.deanonymize(result)

# Add custom PII pattern
anonymizer.add_pattern(
name="employee_id",
pattern=r"EMP-\d{6}",
label="[EMPLOYEE_ID]"
)

# Anonymize text file
python data_anonymizer.py --input document.txt --output document_anon.txt
# Anonymize CSV
python data_anonymizer.py --input customers.csv --output customers_anon.csv
# Specific strategy
python data_anonymizer.py --input data.csv --output anon.csv --strategy fake
# Generate audit report
python data_anonymizer.py --input document.txt --report audit.json
# Specific PII types only
python data_anonymizer.py --input doc.txt --types email phone ssn

| Argument | Description | Default |
|---|---|---|
| `--input` | Input file | Required |
| `--output` | Output file | Required |
| `--strategy` | Masking strategy | mask |
| `--types` | PII types to detect | all |
| `--columns` | CSV columns to process | auto |
| `--report` | Generate audit report | - |
| `--reversible` | Enable token mapping | False |

| Type | Examples | Pattern |
|---|---|---|
| Names | John Smith, Mary Johnson | NLP-based |
| Emails | user@domain.com | Regex |
| Phones | 555-123-4567, (555) 123-4567 | Regex |
| SSN | 123-45-6789 | Regex |
| Credit cards | 4111-1111-1111-1111 | Regex + Luhn |
| Addresses | 123 Main St, City, ST 12345 | NLP + Regex |
| Dates | 01/15/1990, January 15, 1990 | Regex |
| IP addresses | 192.168.1.1 | Regex |

anonymizer = DataAnonymizer(strategy="mask")
log = """
Ticket #1234: Customer John Doe (john.doe@company.com) called about
billing issue. SSN on file: 123-45-6789. Callback number: 555-867-5309.
Address: 123 Oak Street, Springfield, IL 62701.
"""
result = anonymizer.anonymize(log)
print(result)
# Ticket #1234: Customer [NAME] ([EMAIL]) called about
# billing issue. SSN on file: [SSN]. Callback number: [PHONE].
# Address: [ADDRESS].

anonymizer = DataAnonymizer(strategy="hash")
# Consistent hashing for joins
anonymizer.anonymize_csv(
"users.csv",
"users_anon.csv",
columns=["email", "name", "phone"]
)
anonymizer.anonymize_csv(
"orders.csv",
"orders_anon.csv",
columns=["customer_email"] # Same hash as users.email
)

anonymizer = DataAnonymizer(strategy="fake")
# Replace real PII with realistic fake data
anonymizer.anonymize_csv(
"production_data.csv",
"test_data.csv"
)
# Test data has same structure but fake PII

pandas>=2.0.0
faker>=18.0.0