Loading...
Loading...
Walk through omicverse's single-cell preprocessing tutorials to QC PBMC3k data, normalise counts, detect HVGs, and run PCA/embedding pipelines on CPU, CPU–GPU mixed, or GPU stacks.
npx skill4agent add starlitnightly/omicverse single-cell-preprocessing-with-omicverset_preprocess.ipynbt_preprocess_cpu.ipynbt_preprocess_gpu.ipynbomicverse as ovscanpy as scov.plot_set(font_path='Arial')ov.ov_plot_set()%load_ext autoreload%autoreload 2pbmc3k_filtered_gene_bc_matrices.tar.gzdata/filtered_gene_bc_matrices/hg19/sc.read_10x_mtx(..., var_names='gene_symbols', cache=True)write/ov.pp.qc(adata, tresh={'mito_perc': 0.2, 'nUMIs': 500, 'detected_genes': 250}, doublets_method='scrublet')doublets_methodov.utils.store_layers(adata, layers='counts')ov.pp.preprocess(adata, mode='shiftlog|pearson', n_HVGs=2000, target_sum=5e5)target_sum=Noneov.pp.recover_counts(...)adata.layers['recover_counts'].raw.rawadata.raw = adataadata.raw = adata.copy()ov.utils.retrieve_layers(adata_counts, layers='counts')ov.pp.scale(adata)ov.pp.pca(adata, layer='scaled', n_pcs=50)sc.pp.neighbors(adata, n_neighbors=15, n_pcs=50, use_rep='scaled|original|X_pca')ov.pp.neighbors(..., use_rep='scaled|original|X_pca')ov.pp.neighbors(..., method='cagra')ov.utils.mde(...)ov.pp.umap(adata)ov.pp.mde(...)ov.pp.tsne(...)ov.pp.sude(...)ov.pp.leiden(adata, resolution=1)ov.single.leiden(adata, resolution=1.0)ov.pp.score_genes_cell_cyclecolor='leiden'# Check if leiden clustering exists, if not, run it
# Run Leiden clustering only when adata.obs has no 'leiden' entry yet.
if 'leiden' not in adata.obs:
    # Build the neighbour graph first when adata.uns has no 'neighbors' entry.
    if 'neighbors' not in adata.uns:
        ov.pp.neighbors(adata, n_neighbors=15, use_rep='X_pca')
    ov.single.leiden(adata, resolution=1.0)
# Residual page text that was fused onto the line above, kept verbatim:
# ov.pl.embedding(...)ov.utils.embedding(...)leidencolor=adata.obsadata.write('write/pbmc3k_preprocessed.h5ad')plt.savefig(...)t_preprocess.ipynbretrieve_layerst_preprocess_cpu.ipynbdoublets_method='scrublet't_preprocess_gpu.ipynbrapids-singlecellov.pp.anndata_to_GPUov.pp.anndata_to_CPUmethod='cagra'sc.read_10x_mtxvar_names='gene_symbols'nvidia-smiov.pp.preprocessscaled|original|X_pcaov.pp.scaleov.pp.pcaadata.obs
# Step 1: Check if batch column exists, create default if not
if 'batch' not in adata.obs.columns:
    adata.obs['batch'] = 'batch_1'  # Default single batch
# Step 2: Handle NaN/missing values - CRITICAL!
# Cast to object before filling: pandas raises when fillna() is given a value
# that is not already a category of a categorical column.
adata.obs['batch'] = adata.obs['batch'].astype(object).fillna('unknown')
# Step 3: Convert to categorical for efficient memory usage
adata.obs['batch'] = adata.obs['batch'].astype('category')
# Now safe to use in batch-aware operations
ov.pp.combat(adata, batch='batch')  # or other batch correction methods
# WRONG! Using batch column without validation can cause NaN errors
# ov.pp.combat(adata, batch='batch') # May fail if batch has NaN values!
# WRONG! Assuming batch column exists
# adata.obs['batch'].unique() # KeyError if column doesn't exist!
# Residual page text that was fused onto the line above, kept verbatim:
# fillna()
# Complete defensive batch preparation pattern:
def prepare_batch_column(adata, batch_key='batch', default_batch='batch_1'):
    """Prepare batch column for batch-aware operations.

    The column ``adata.obs[batch_key]`` is created when absent, its missing
    values are replaced by ``'unknown'``, and it is stored as a categorical
    column of string labels.

    Args:
        adata: Object exposing a pandas ``DataFrame`` as ``adata.obs``;
            it is modified in place.
        batch_key: Name of the batch column in ``adata.obs``.
        default_batch: Label assigned to every row when the column does
            not exist yet.

    Returns:
        The same ``adata`` object that was passed in.
    """
    if batch_key not in adata.obs.columns:
        adata.obs[batch_key] = default_batch
    # Cast to object before filling: pandas raises when fillna() is given a
    # value that is not already a category of a categorical column.
    labels = adata.obs[batch_key].astype(object).fillna('unknown')
    adata.obs[batch_key] = labels.astype(str).astype('category')
    return adata


# Residual page text that was fused onto the line above, kept verbatim:
# seurat_v3ValueError: Extrapolation not allowed with blending
# Robust HVG selection for any dataset size
try:
    sc.pp.highly_variable_genes(
        adata,
        flavor='seurat_v3',
        n_top_genes=2000,
        batch_key='batch',  # if batch correction is needed
    )
except ValueError as e:
    message = str(e)
    # Only the extrapolation / LOESS failure triggers the fallback; any other
    # ValueError is re-raised unchanged.
    if 'Extrapolation' not in message and 'LOESS' not in message:
        raise
    # Fallback to simpler method for small datasets
    # (note: this call does not pass batch_key).
    sc.pp.highly_variable_genes(
        adata,
        flavor='seurat',  # Works with any size
        n_top_genes=2000,
    )
# cell_ranger flavor is more robust for batched data
sc.pp.highly_variable_genes(
    adata,
    flavor='cell_ranger',  # No LOESS, works with batches
    n_top_genes=2000,
    batch_key='batch',
)
# Residual page text that was fused onto the line above, kept verbatim:
# seuratcell_rangerseurat_v3
# Safe batch-aware HVG pattern
def safe_highly_variable_genes(adata, batch_key='batch', n_top_genes=2000):
    """Select HVGs with automatic fallback for small batches.

    First calls ``sc.pp.highly_variable_genes`` with ``flavor='seurat_v3'``
    and the given ``batch_key``. If that call raises ``ValueError``, it is
    retried with ``flavor='seurat'`` and without ``batch_key``.

    Args:
        adata: Annotated data object passed straight to scanpy.
        batch_key: Batch column name used for the ``seurat_v3`` attempt only.
        n_top_genes: Number of genes requested in both attempts.

    Returns:
        None. Results are whatever ``sc.pp.highly_variable_genes`` writes
        onto ``adata``.
    """
    try:
        sc.pp.highly_variable_genes(
            adata,
            flavor='seurat_v3',
            n_top_genes=n_top_genes,
            batch_key=batch_key,
        )
    except ValueError:
        # Fallback for small batches
        # NOTE(review): every ValueError triggers the fallback, not just the
        # small-batch LOESS failure, and the retry ignores batch_key.
        # NOTE(review): scanpy documents 'seurat_v3' as expecting count data
        # and 'seurat' as expecting log-transformed data -- confirm adata.X
        # suits the fallback.
        sc.pp.highly_variable_genes(
            adata,
            flavor='seurat',
            n_top_genes=n_top_genes,
        )


# Residual page text that was fused onto the line above, kept verbatim:
# shiftlog|pearsonmethod='cagra't_preprocess.ipynbt_preprocess_cpu.ipynbt_preprocess_gpu.ipynbreference.md