Loading...
Loading...
Library for bioinformatics and community ecology statistics. Provides data structures and algorithms for sequences, alignments, phylogenetics, and diversity analysis. Essential for microbiome research and ecological data science. Use for alpha/beta diversity metrics, ordination (PCoA), phylogenetic trees, sequence manipulation (DNA/RNA/Protein), distance matrices, PERMANOVA, and community ecology analysis.
npx skill4agent add tondevrel/scientific-agent-skills scikit-bioskbio.sequenceskbio.stats.distanceskbio.diversityskbio.stats.ordinationpip install scikit-bioimport skbio
import numpy as np
import pandas as pd
from skbio import DNA, RNA, Protein, Sequence
from skbio.stats.distance import DistanceMatrix
from skbio.diversity import alpha, beta
from skbio.stats.ordination import pcoafrom skbio import DNA
# 1. Create a sequence with metadata
seq = DNA("ACC--GTT", metadata={'id': 'sample1', 'desc': 'gene A'})
# 2. Biological operations
rc = seq.reverse_complement()
degapped = seq.degap()
# 3. Validation
print(f"Is valid? {seq.is_valid()}") # Checks against DNA alphabet.is_valid()pcoa()skbio.tree.TreeNodeskbio.diversity.betafrom skbio import DNA
# ❌ BAD: Manual reverse complement with string logic
rc_str = seq_str[::-1].replace('A', 't').replace('T', 'a')... # Fragile
# ✅ GOOD: Built-in validated method
rc_seq = DNA(seq_str).reverse_complement()
# ❌ BAD: Using raw lists for community analysis
# data = [[1, 0, 5], [2, 1, 0]]
# dist = manual_bray_curtis(data)
# ✅ GOOD: Using DistanceMatrix
from skbio.stats.distance import DistanceMatrix
dm = DistanceMatrix(matrix_data, ids=['S1', 'S2', 'S3'])
# ❌ BAD: Stripping metadata to run scikit-learn
# vals = seq.values # Metadata lost!
# ✅ GOOD: Process within skbio or use standard IO
seq.write('output.fasta')from skbio import DNA, Protein
# Sequence with quality scores
seq = DNA("ACGT", positional_metadata={'quality': [30, 35, 40, 20]})
# Slicing preserves metadata
sub = seq[1:3]
print(sub.positional_metadata) # {'quality': [35, 40]}
# K-mer frequencies
freqs = seq.kmer_frequencies(k=2)
# Translation (DNA -> Protein)
protein = DNA("ATGCGA").translate()from skbio.diversity import alpha
counts = [10, 0, 5, 2, 20] # Species abundances
otus = ['OTU1', 'OTU2', 'OTU3', 'OTU4', 'OTU5']
shannon = alpha.shannon(counts)
simpson = alpha.simpson(counts)
observed_otus = alpha.observed_otus(counts)
print(f"Shannon Index: {shannon:.3f}")from skbio.diversity import beta
import numpy as np
# Abundance table (samples x taxa)
data = np.array([[10, 20, 0],
[5, 15, 2],
[0, 1, 30]])
ids = ['Sample1', 'Sample2', 'Sample3']
# Bray-Curtis distance
bc_dm = beta.pw_distances(data, ids=ids, metric='braycurtis')
# Weighted UniFrac (Requires a tree)
# unifrac_dm = beta.weighted_unifrac(data, otu_ids, tree)from skbio import TreeNode
from io import StringIO
# Load Newick tree
tree_str = "((A:0.1, B:0.2)C:0.3, D:0.4)E;"
tree = TreeNode.read(StringIO(tree_str))
# Traverse
for node in tree.tips():
print(f"Leaf: {node.name}, dist: {node.length}")
# Find Lowest Common Ancestor
lca = tree.find_lca(['A', 'B'])
print(f"LCA of A and B is {lca.name}")
# Rooting
tree.root_at('D')from skbio.stats.ordination import pcoa
from skbio.stats.distance import DistanceMatrix
# Create DM
dm = DistanceMatrix([[0, 0.5, 0.8], [0.5, 0, 0.2], [0.8, 0.2, 0]],
ids=['A', 'B', 'C'])
# Perform PCoA
results = pcoa(dm)
# View proportion of variance explained
print(results.proportion_explained)
# Access coordinates for plotting
coords = results.samplesfrom skbio.stats.distance import permanova
# metadata linking IDs to groups
metadata = pd.DataFrame({
'BodySite': ['Gut', 'Gut', 'Skin', 'Skin']},
index=['S1', 'S2', 'S3', 'S4'])
# Assuming dm is a DistanceMatrix of S1..S4
# results = permanova(dm, metadata, column='BodySite', permutations=999)
# print(results['p-value'])def plot_microbiome_pcoa(abundance_df, metadata_df, group_col):
"""Full workflow: Abundance -> Distance -> PCoA -> Table."""
from skbio.diversity import beta
from skbio.stats.ordination import pcoa
# 1. Calculate Bray-Curtis distance
dm = beta.pw_distances(abundance_df.values, ids=abundance_df.index, metric='braycurtis')
# 2. PCoA
pc = pcoa(dm)
# 3. Merge with metadata for plotting
plot_data = pc.samples[['PC1', 'PC2']].join(metadata_df)
return plot_data # Ready for Seaborn/Plotlydef filter_low_quality_sequences(sequences, min_avg_qual=30):
"""Filters DNA sequences based on Phred scores."""
valid_seqs = []
for s in sequences:
# Assuming quality is in positional_metadata
avg_q = np.mean(s.positional_metadata['quality'])
if avg_q >= min_avg_qual:
valid_seqs.append(s)
return valid_seqsdef get_tip_distances(tree):
"""Returns a distance matrix of all tips in a tree."""
dm = tree.tip_tip_distances()
return dmbeta.pw_distancestree.preorder()tree.postorder()# ✅ Solution: Ensure alignment
common_ids = set(dm.ids).intersection(metadata.index)
dm_sub = dm.filter(common_ids)
metadata_sub = metadata.loc[list(common_ids)]skbio.diversity.beta.unweighted_unifrac# ❌ Problem: DNA("ACGN") raises error if 'N' isn't handled
# ✅ Solution: scikit-bio DNA supports IUPAC (N, R, Y, etc.) by default.
# But if you have non-standard characters:
from skbio import Sequence
custom = Sequence("ACG-X") # Generic sequence permits anything