State-of-the-art Machine Learning for PyTorch, TensorFlow, and JAX. Provides thousands of pretrained models to perform tasks on different modalities such as text, vision, and audio. The industry standard for Large Language Models (LLMs) and foundation models in science.
npx skill4agent add tondevrel/scientific-agent-skills transformers

pip install transformers datasets tokenizers
# Requires a backend (PyTorch, TensorFlow, or JAX)
pip install torch

import torch
from transformers import pipeline, AutoModel, AutoTokenizer, TrainingArguments, Trainer
# 1. Initialize a pipeline (automatically downloads model)
classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
# 2. Run inference
results = classifier("The molecular structure of this compound is fascinating.")
print(results)
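The same pipeline API covers vision and audio as well as text. A minimal sketch, assuming the public google/vit-base-patch16-224 and openai/whisper-tiny checkpoints:

# Vision: image classification
vision_pipe = pipeline("image-classification", model="google/vit-base-patch16-224")

# Audio: speech-to-text
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")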
from transformers import AutoModel, AutoTokenizer

# ❌ BAD: Re-initializing the model inside a function called frequently
def get_prediction(text):
    model = AutoModel.from_pretrained("bert-base-uncased")  # ❌ SLOW & RAM HEAVY
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    return model(**tokenizer(text, return_tensors="pt"))

# ✅ GOOD: Load once globally (or in a class) and reuse
model = AutoModel.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def get_prediction(text):
    return model(**tokenizer(text, return_tensors="pt"))
# ❌ BAD: Manual string splitting for "tokens"
# tokens = text.split(" ") # ❌ Not compatible with model vocabulary
# ✅ GOOD: Use the model's specific tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
inputs = tokenizer(text, return_tensors="pt")
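To see what the tokenizer actually produced, continuing the snippet above: BERT-style tokenizers wrap the input in [CLS] and [SEP] special tokens automatically.

print(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist()))
# ['[CLS]', ..., '[SEP]']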
# ❌ BAD: Forgetting to move the model to the GPU
# model = AutoModel.from_pretrained("...")
# output = model(inputs.to("cuda"))  # ❌ Error: inputs moved, but model is still on CPU!
# ✅ GOOD: call model.to("cuda") once after loading

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
texts = ["Science is cool.", "Quantum physics is hard."]
# Batch encoding
inputs = tokenizer(
    texts,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"  # Returns PyTorch tensors
)
print(inputs['input_ids'].shape)  # (batch_size, seq_len)
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    evaluation_strategy="epoch",
    logging_dir="./logs",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)
# trainer.train()
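The tokenized_train and tokenized_val datasets above are assumed to exist. A minimal sketch of building them with the datasets library (the 'imdb' dataset and the tokenize_fn name are illustrative); note that Trainer expects a model with a task head, e.g. AutoModelForSequenceClassification, rather than the bare AutoModel:

from datasets import load_dataset

raw = load_dataset("imdb")  # illustrative text-classification dataset

def tokenize_fn(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=128)

tokenized = raw.map(tokenize_fn, batched=True)
tokenized_train = tokenized["train"]
tokenized_val = tokenized["test"]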
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
model = AutoModel.from_pretrained("facebook/esm2_t6_8M_UR50D")
protein_seq = "MAPLRKTYLLG"
inputs = tokenizer(protein_seq, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# The 'last_hidden_state' represents a "feature vector" for each amino acid
embeddings = outputs.last_hidden_state
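To reduce the per-residue embeddings to a single sequence-level vector, one common choice (an assumption, not the only option) is attention-mask-weighted mean pooling:

# Mean-pool over residues, ignoring padding positions
mask = inputs["attention_mask"].unsqueeze(-1)               # (1, seq_len, 1)
seq_embedding = (embeddings * mask).sum(1) / mask.sum(1)    # (1, hidden_size)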
pipe = pipeline("text-classification", model="seyonec/ChemBERTa-zinc-base-v1")
smiles = "CC(=O)Oc1ccccc1C(=O)O" # Aspirin
result = pipe(smiles)
print(f"Prediction: {result}")# Extracting genes, proteins, or chemicals from text
ner_pipe = pipeline("ner", model="dslim/bert-base-NER")
text = "The expression of the BRCA1 gene was observed in the sample."
entities = ner_pipe(text)
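By default the pipeline returns one entry per word-piece token; the aggregation_strategy parameter merges the pieces into whole entities:

ner_pipe = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
entities = ner_pipe(text)  # merged spans, each with an 'entity_group' field
print(entities)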
from transformers import BitsAndBytesConfig

# Load model in 4-bit precision (requires the bitsandbytes and accelerate packages)
quant_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModel.from_pretrained("model_name", quantization_config=quant_config)

# 'device=0' targets the first CUDA device
pipe = pipeline("translation_en_to_fr", model="t5-base", device=0)# ❌ Problem: Batch size is too large for GPU RAM
# ❌ Problem: Batch size is too large for GPU RAM
# ✅ Solution:
# 1. Reduce 'per_device_train_batch_size'
# 2. Use 'gradient_accumulation_steps' to keep the effective batch size constant
# 3. Use 'fp16=True' in TrainingArguments
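A sketch combining fixes 1-3 (the batch size and accumulation values are illustrative):

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,   # smaller per-step batch
    gradient_accumulation_steps=4,   # effective batch size stays 16
    fp16=True,                       # mixed-precision training
)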
# ❌ Problem: outputs[0] works, but is confusing
# ✅ Solution: Access outputs by name
outputs = model(**inputs)
hidden_states = outputs.last_hidden_state

# ❌ Problem: Tokenization of large corpora is slow
# ✅ Solution: Use "Fast" tokenizers (written in Rust, usually the default)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)