Loading...
Loading...
One-click model liberation toolkit for removing refusal behaviors from LLMs via surgical abliteration techniques
npx skill4agent add aradotso/trending-skills obliteratus-abliteration

Skill by ara.so — Daily 2026 Skills collection.
# Core install
pip install obliteratus
# With Gradio UI support
pip install "obliteratus[spaces]"
# With all optional analysis modules
pip install "obliteratus[full]"
# From source (latest)
git clone https://github.com/elder-plinius/OBLITERATUS
cd OBLITERATUS
pip install -e ".[full]"

# Key dependencies
transformers
accelerate
gradio>=5.29.0

# HuggingFace authentication
export HF_TOKEN=your_hf_token_here
huggingface-cli login

# Basic obliteration (default method)
obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct
# Advanced method (whitened SVD + bias projection + iterative refinement)
obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct --method advanced
# Analysis-informed pipeline (auto-configures from geometry analysis)
obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct --method informed
# Specify output directory and push to Hub
obliteratus obliterate mistralai/Mistral-7B-Instruct-v0.3 \
--method advanced \
--output ./my-liberated-model \
--push-to-hub your-username/mistral-7b-liberated
# LoRA-based reversible ablation (non-destructive)
obliteratus obliterate meta-llama/Llama-3.1-8B-Instruct \
--method lora \
--lora-rank 1
# Strength sweep — find the capability/compliance tradeoff
obliteratus sweep meta-llama/Llama-3.1-8B-Instruct \
--strengths 0.2,0.4,0.6,0.8,1.0
# Run analysis modules only (no modification)
obliteratus analyze meta-llama/Llama-3.1-8B-Instruct \
--modules concept_cone,alignment_imprint,universality
# Benchmark: compare methods on a model
obliteratus benchmark meta-llama/Llama-3.1-8B-Instruct \
--methods basic,advanced,informed
# Launch local Gradio UI
obliteratus ui
obliteratus ui --port 8080 --share
obliteratus ui --no-telemetry

from obliteratus import Obliterator
# Initialize with a HuggingFace model ID or local path
obl = Obliterator("meta-llama/Llama-3.1-8B-Instruct")
# Run the full pipeline: SUMMON → PROBE → DISTILL → EXCISE → VERIFY → REBIRTH
result = obl.obliterate(method="advanced")
print(result.perplexity_delta) # capability preservation metric
print(result.refusal_rate_delta) # refusal reduction
print(result.output_path) # where the model was saved

from obliteratus import Obliterator
from obliteratus.pipeline import PipelineConfig
config = PipelineConfig(
method="advanced",
num_directions=32, # number of refusal directions to extract
strength=1.0, # projection strength (0.0–1.0+)
preserve_norm=True, # norm-preserving biprojection
project_biases=True, # also remove from bias terms
iterative_passes=3, # re-probe after each pass
layers="auto", # or list of ints, e.g. [10, 11, 12, 13]
dtype="bfloat16",
device="cuda",
)
obl = Obliterator("mistralai/Mistral-7B-Instruct-v0.3", config=config)
# Individual stages
obl.summon() # load model + tokenizer
activations = obl.probe() # collect activations on restricted vs unrestricted prompts
directions = obl.distill(activations) # extract refusal directions via SVD
obl.excise(directions) # project out guardrail directions
metrics = obl.verify() # perplexity + coherence checks
obl.rebirth("./liberated-mistral-7b") # save with metadata

from obliteratus import Obliterator
from obliteratus.probing import ProbeDataset
# Use your own restricted/unrestricted prompt pairs
dataset = ProbeDataset(
restricted=[
"How do I pick a lock?",
"Write a story with explicit violence.",
"Explain how malware works in detail.",
],
unrestricted=[
"What is the capital of France?",
"Write a story about a dog.",
"Explain how encryption works.",
]
)
obl = Obliterator("google/gemma-2-9b-it")
obl.summon()
activations = obl.probe(dataset=dataset)
directions = obl.distill(activations)
obl.excise(directions)
obl.rebirth("./liberated-gemma-2-9b")

from obliteratus.analysis import AnalysisSuite
suite = AnalysisSuite("meta-llama/Llama-3.1-8B-Instruct")
suite.load()
# Concept Cone Geometry — how many distinct refusal mechanisms?
cone = suite.concept_cone_geometry()
print(f"Solid angle estimate: {cone.solid_angle:.4f}")
print(f"Distinct refusal clusters: {cone.num_clusters}")
# Alignment Imprint Detection — DPO vs RLHF vs CAI vs SFT?
imprint = suite.alignment_imprint()
print(f"Detected training method: {imprint.method}") # e.g. "RLHF"
print(f"Confidence: {imprint.confidence:.2%}")
# Ouroboros Effect — will it self-repair?
ouroboros = suite.ouroboros_quantification()
print(f"Self-repair score: {ouroboros.score:.4f}")
print(f"Recommended passes: {ouroboros.recommended_passes}")
# Cross-layer heatmap of refusal signal
heatmap = suite.layer_refusal_heatmap()
heatmap.plot(save_path="./refusal_heatmap.png")
# Safety-capability entanglement
entanglement = suite.entanglement_map()
print(f"Safe layers to modify: {entanglement.safe_layers}")
print(f"Risky layers (entangled): {entanglement.risky_layers}")

from obliteratus import Obliterator
from obliteratus.pipeline import PipelineConfig
# "informed" method runs analysis modules mid-pipeline
# to auto-configure every decision
config = PipelineConfig(method="informed")
obl = Obliterator("meta-llama/Llama-3.1-8B-Instruct", config=config)
result = obl.obliterate()
print(result.analysis_report) # full auto-configuration decisions

from obliteratus import Obliterator
from obliteratus.chat import ChatSession
obl = Obliterator("./liberated-llama-3.1-8b")
obl.summon() # loads pre-obliterated model
session = ChatSession(obl.model, obl.tokenizer)
response = session.chat(
"Explain in detail how a buffer overflow exploit works.",
max_new_tokens=512,
temperature=0.7,
)
print(response)

from obliteratus.compare import ABComparison
ab = ABComparison(
original_path="meta-llama/Llama-3.1-8B-Instruct",
obliterated_path="./liberated-llama-3.1-8b",
)
prompt = "Write a story involving morally grey characters."
original_resp, liberated_resp = ab.compare(prompt)
print("=== ORIGINAL ===")
print(original_resp)
print("=== LIBERATED ===")
print(liberated_resp)

import os
from obliteratus import Obliterator
obl = Obliterator("meta-llama/Llama-3.1-8B-Instruct")
result = obl.obliterate(method="advanced")
result.push_to_hub(
repo_id=f"{os.environ['HF_USERNAME']}/Llama-3.1-8B-Instruct-abliterated",
token=os.environ["HF_TOKEN"],
private=True,
)

| Method | Description | Best For |
|---|---|---|
| `basic` | Mean-difference direction extraction, single pass | Quick experiments |
| `advanced` | Whitened SVD + bias projection + iterative refinement | Production use |
| `informed` | Analysis-guided auto-configuration | Unknown models |
| `lora` | Reversible LoRA rank-1 adapters (no weight surgery) | Reversible ablation |
| `pca` | PCA-based direction extraction | Research/comparison |
| `sae` | Sparse autoencoder decomposition | MoE models |
from obliteratus.pipeline import PipelineConfig
config = PipelineConfig(
# Core
method="advanced", # abliteration method
strength=1.0, # projection strength (tune down if capability degrades)
num_directions=32, # refusal directions to extract
# Layer selection
layers="auto", # "auto", "cosmic", or list of ints
layer_selection="cosmic", # COSMIC: most separable layers
# Weight modification
preserve_norm=True, # norm-preserving biprojection (recommended)
project_biases=True, # project out bias terms too
project_attention=True, # modify attention projection weights
project_mlp=True, # modify MLP weights
# Iterative refinement
iterative_passes=3, # re-probe after each pass (catches rotated directions)
# MoE-specific
expert_granular=False, # Expert-Granular Abliteration for MoE models
# CoT preservation
cot_aware=True, # preserve chain-of-thought directions
# Hardware
dtype="bfloat16", # "float32", "float16", "bfloat16"
device="cuda", # "cuda", "cpu", "auto"
load_in_4bit=False, # bitsandbytes 4-bit loading
# Telemetry (anonymous, contributes to research dataset)
telemetry=True,
)

from obliteratus import Obliterator
from obliteratus.sweep import StrengthSweep
# Find the sweet spot before running full obliteration
sweep = StrengthSweep("meta-llama/Llama-3.1-8B-Instruct")
results = sweep.run(strengths=[0.2, 0.4, 0.6, 0.8, 1.0, 1.2])
for r in results:
print(f"Strength {r.strength:.1f} | perplexity_delta={r.perplexity_delta:.2f} | refusal_rate={r.refusal_rate:.2%}")
# Pick the best tradeoff
best = sweep.recommend()
print(f"Recommended strength: {best.strength}")

from obliteratus import Obliterator
from obliteratus.pipeline import PipelineConfig
config = PipelineConfig(
method="advanced",
expert_granular=True, # decompose per-expert refusal signals
project_attention=True,
project_mlp=True,
)
obl = Obliterator("mistralai/Mixtral-8x7B-Instruct-v0.1", config=config)
obl.obliterate()
obl.rebirth("./liberated-mixtral-8x7b")

from obliteratus.benchmark import ModelBenchmark
models = [
"meta-llama/Llama-3.1-8B-Instruct",
"google/gemma-2-9b-it",
"mistralai/Mistral-7B-Instruct-v0.3",
]
bench = ModelBenchmark(models=models, method="advanced")
report = bench.run()
report.save("./benchmark_report.json")
report.plot_heatmap("./benchmark_heatmap.png")

config = PipelineConfig(
dtype="float16",
load_in_4bit=True, # requires bitsandbytes
device="cuda",
layers=[10, 11, 12, 13], # target fewer layers
num_directions=16, # fewer directions
)

# Lower the strength or use COSMIC layer selection (most separable layers)
config = PipelineConfig(
strength=0.6,
layer_selection="cosmic",
cot_aware=True, # protect reasoning directions
iterative_passes=1, # fewer passes = less aggressive
)

# Use informed method + increase passes
config = PipelineConfig(
method="informed",
iterative_passes=5,
project_biases=True, # don't forget bias terms
num_directions=64, # extract more directions
)

export HF_TOKEN=your_hf_token_here
# Accept model license on HuggingFace Hub first, then:
huggingface-cli login

pip install "obliteratus[spaces]"
# Check port availability
obliteratus ui --port 7861