Loading...
Loading...
Best practices for creating comprehensive Jupyter notebook data analyses with statistical rigor, outlier handling, and publication-quality visualizations
npx skill4agent add delphine-l/claude_global jupyter-notebook-analysis

# BAD - Shadows global 'data' variable
for i, (sp, data) in enumerate(species_by_gc_content[:10], 1):
val = data['gc_content']
print(f'{sp}: {val}')data# GOOD - Uses specific name
for i, (sp, sp_data) in enumerate(species_by_gc_content[:10], 1):
val = sp_data['gc_content']
print(f'{sp}: {val}')

Instead of reusing `data`, prefer specific names such as `item`, `value`, `sp_data`, `row_data`, or `inv_data`.

for data in dataset: # Shadows 'data'
for i, data in enumerate(): # Shadows 'data'
for key, data in dict.items() # Shadows 'data'# Assumed column name
df_filtered = df[df['scientific_name'] == target] # KeyError!
# Actual column name was 'Scientific Name' (capitalized with space)import pandas as pd
df = pd.read_csv('data.csv')
# ALWAYS print columns before processing
print("Available columns:")
print(df.columns.tolist())
# Then write filtering code with correct names
df_filtered = df[df['Scientific Name'] == target_species] # Correct# At the start of your script
def verify_required_columns(df, required_cols):
    """Verify that a DataFrame contains every required column.

    Fails fast with a readable message instead of letting downstream
    cells die with an opaque KeyError on a missing/renamed column.

    Args:
        df: pandas DataFrame to check.
        required_cols: iterable of column names that must be present.

    Raises:
        SystemExit: exits with status 1 (via sys.exit) when any required
            column is absent, after printing the missing and available
            column names.
    """
    missing = [col for col in required_cols if col not in df.columns]
    if missing:
        print(f"ERROR: Missing columns: {missing}")
        print(f"Available columns: {df.columns.tolist()}")
        sys.exit(1)
# Use it
required = ['Scientific Name', 'tolid', 'accession']
verify_required_columns(df, required)scientific_nameScientific NameScientificNamespecies_idspeciesSpecies IDgenome_sizeGenome sizeGenomeSize# Add at script start for easy debugging
if '--debug' in sys.argv or len(df.columns) < 10:
print(f"Columns ({len(df.columns)}): {df.columns.tolist()}")import numpy as np
# --- Outlier removal for statistics: upper 1.5*IQR (Tukey) fence ---
# Loop variables renamed from `id` to `eid` so the builtin id() is not shadowed.
workflow_counts = [entity_data[eid]['workflow_count'] for eid in entity_data]
q1 = np.percentile(workflow_counts, 25)
q3 = np.percentile(workflow_counts, 75)
iqr = q3 - q1
upper_bound = q3 + 1.5 * iqr
# Collect keys first, then delete: never mutate a dict while iterating it.
outliers = [eid for eid in entity_data
            if entity_data[eid]['workflow_count'] > upper_bound]
for eid in outliers:
    del entity_data[eid]

# --- Outlier capping for visualization only: 95th percentile of the metric ---
values = [entity_data[eid]['metric'] for eid in entity_data]
threshold = np.percentile(values, 95)
viz_entities = [eid for eid in entity_data
                if entity_data[eid]['metric'] <= threshold]
# Use viz_entities for plotting
# Use full entity_data.keys() for statistics
# After removing workflow count outliers, also remove heterozygosity outliers
# Remove heterozygosity outliers above the upper 1.5*IQR (Tukey) fence.
heterozygosity_values = [species_data[sp]['heterozygosity'] for sp in species_data]
het_q1 = np.percentile(heterozygosity_values, 25)
het_q3 = np.percentile(heterozygosity_values, 75)
het_iqr = het_q3 - het_q1
het_upper_bound = het_q3 + 1.5 * het_iqr
# Collect keys first, then delete: never mutate a dict while iterating it.
het_outliers = [sp for sp in species_data
                if species_data[sp]['heterozygosity'] > het_upper_bound]
for sp in het_outliers:
    del species_data[sp]
print(f'Removed {len(het_outliers)} heterozygosity outliers (>{het_upper_bound:.2f}%)')
# BUG FIX: the original reported min/max of an undefined variable `vals`
# (NameError); recompute the range from the surviving species instead.
remaining_het = [species_data[sp]['heterozygosity'] for sp in species_data]
print(f'New heterozygosity range: {min(remaining_het):.2f}% - {max(remaining_het):.2f}%')
# Calculate IQR
# Compute the interquartile range (IQR) of the series
Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3 - Q1
# Define outlier boundaries (standard Tukey fences: 1.5×IQR beyond each quartile)
lower_bound = Q1 - 1.5*IQR
upper_bound = Q3 + 1.5*IQR
# Filter outliers: mask is True for in-range points, so ~mask counts the outliers
outlier_mask = (data >= lower_bound) & (data <= upper_bound)
data_filtered = data[outlier_mask]
n_outliers = (~outlier_mask).sum()
# IMPORTANT: Report outliers removed — filtering must be disclosed in the output
print(f"Removed {n_outliers} outliers for visualization")
# Add to figure: f"({n_outliers} outliers removed)"
# For scatter plots with two dimensions (e.g., size ratio AND absolute size)
outlier_mask = (
(ratio >= Q1_ratio - 1.5*IQR_ratio) &
(ratio <= Q3_ratio + 1.5*IQR_ratio) &
(size >= Q1_size - 1.5*IQR_size) &
(size <= Q3_size + 1.5*IQR_size)
)from scipy import stats
correlation, p_value = stats.pearsonr(x_values, y_values)
sig_text = 'significant' if p_value < 0.05 else 'not significant'ax.text(0.98, 0.02,
f'r = {correlation:.3f}\np = {p_value:.2e}\n({sig_text})\nn = {len(data)} species',
transform=ax.transAxes, ...)from scipy import stats
# Calculate test
data_group1 = df[df['group'] == 'Group1']['metric']
data_group2 = df[df['group'] == 'Group2']['metric']
if len(data_group1) > 0 and len(data_group2) > 0:
stat, pval = stats.mannwhitneyu(data_group1, data_group2, alternative='two-sided')
else:
pval = np.nan
# Add to stats text
if not np.isnan(pval):
stats_text += f"\nMann-Whitney p: {pval:.2e}"Mann-Whitney p: 1.23e-04import pandas as pd
from scipy import stats
def check_confounding(df, method_col, characteristics, min_group_size=5):
    """
    Compare sample characteristics between methods to check for confounding.

    Args:
        df: DataFrame with samples
        method_col: Column indicating method ('Method_A', 'Method_B')
        characteristics: List of column names to compare
        min_group_size: Minimum samples required in EACH group for a
            characteristic to be tested (default 5; smaller ones are skipped)

    Returns:
        DataFrame with one row per testable characteristic: group medians
        and sizes, Mann-Whitney U p-value, percent difference in medians,
        and a significance flag (p < 0.05).
    """
    results = []
    for char in characteristics:
        # Get data for each method, dropping missing values
        method_a = df[df[method_col] == 'Method_A'][char].dropna()
        method_b = df[df[method_col] == 'Method_B'][char].dropna()
        if len(method_a) < min_group_size or len(method_b) < min_group_size:
            continue
        # Non-parametric two-sided test: no normality assumption
        stat, pval = stats.mannwhitneyu(method_a, method_b, alternative='two-sided')
        # Effect size: % difference in medians relative to the pooled median.
        # Guard against division by zero when the pooled median is 0.
        pooled_median = pd.concat([method_a, method_b]).median()
        if pooled_median != 0:
            effect_pct = (method_a.median() - method_b.median()) / pooled_median * 100
        else:
            effect_pct = float('nan')
        results.append({
            'Characteristic': char,
            'Method_A_median': method_a.median(),
            'Method_A_n': len(method_a),
            'Method_B_median': method_b.median(),
            'Method_B_n': len(method_b),
            'p_value': pval,
            'effect_pct': effect_pct,
            'significant': pval < 0.05
        })
    return pd.DataFrame(results)
# Example usage
characteristics = ['genome_size', 'gc_content', 'heterozygosity',
'repeat_content', 'sequencing_coverage']
confounding_check = check_confounding(df, 'curation_method', characteristics)
print(confounding_check)## Genome Characteristics Comparison
**Control Analysis**: Are quality differences due to method or sample properties?
[Table comparing characteristics]
**Conclusion**:
- If no differences → Valid method comparison
- If Method A works with harder samples → Strengthens conclusions
- If Method A works with easier samples → Potential confoundingfig18_genome_size_vs_cpu_hours.pngspecies_datagenome_sizes_fullgenome_sizes_vizsafe_float_convert()# Create template with placeholder variables
# Build each analysis cell from one shared template string.
# {name}/{display} are str.format placeholders; {{}} is an escaped literal {}.
template = '''
if len(data_with_species) > 0:
print('Analyzing {display} vs {metric}...\\n')
# Aggregate data per species
species_data = {{}}
for inv in data_with_species:
{name} = safe_float_convert(inv.get('{name}'))
if {name} is None:
continue
# ... analysis code
'''
# Generate multiple cells from characteristics list
# NOTE(review): the template references {metric}, but the characteristic
# dicts below provide 'name'/'display'/'unit' — confirm the keys match,
# otherwise template.format(**char) raises KeyError.
characteristics = [
{'name': 'genome_size', 'display': 'Genome Size', 'unit': 'Gb'},
{'name': 'heterozygosity', 'display': 'Heterozygosity', 'unit': '%'},
# ...
]
# Render the template once per characteristic; dict keys feed the placeholders.
for char in characteristics:
code = template.format(**char)
# Write to notebook or temp filedef safe_float_convert(value):
"""Convert string to float, handling comma separators"""
if not value or not str(value).strip():
return None
try:
return float(str(value).replace(',', ''))
except (ValueError, TypeError):
return Nonealpha=0.3, linestyle='--'s=[50 + count*20 for count in counts]| Element | Default | Publication | Code |
|---|---|---|---|
| Title | 11-12pt | 18pt (bold) | |
| Axis labels | 10-11pt | 16pt (bold) | |
| Tick labels | 9-10pt | 14pt | |
| Legend | 8-10pt | 12pt | |
| Annotations | 8-10pt | 11-13pt | |
| Data points | 20-36 | 60-100 | |
fig, ax = plt.subplots(figsize=(10, 8))
# Plot data
ax.scatter(x, y, s=80, alpha=0.6) # Larger points
# Titles and labels - BOLD
ax.set_title('Your Title Here', fontsize=18, fontweight='bold')
ax.set_xlabel('X Axis Label', fontsize=16, fontweight='bold')
ax.set_ylabel('Y Axis Label', fontsize=16, fontweight='bold')
# Tick labels
ax.tick_params(axis='both', which='major', labelsize=14)
# Legend
ax.legend(fontsize=12, loc='best')
# Stats box
stats_text = "Statistics:\nMean: 42.5"
ax.text(0.02, 0.98, stats_text, transform=ax.transAxes,
fontsize=13, family='monospace',
bbox=dict(boxstyle='round', facecolor='yellow', alpha=0.3))
# Reference lines - thicker
ax.axhline(y=1.0, linewidth=2.5, linestyle='--', alpha=0.6)# For comparing two groups/conditions
colors = {
'Group_A': '#0173B2', # Blue
'Group_B': '#DE8F05' # Orange
}import matplotlib.pyplot as plt
# Define colorblind-safe palette
CB_COLORS = {
'blue': '#0173B2',
'orange': '#DE8F05',
'green': '#029E73',
'red': '#D55E00',
'purple': '#CC78BC',
'brown': '#CA9161'
}
# Use in plots
plt.scatter(x, y, color=CB_COLORS['blue'], label='Treatment')
plt.scatter(x2, y2, color=CB_COLORS['orange'], label='Control')import matplotlib.pyplot as plt
# After creating your plot
n_group_a = len(df[df['group'] == 'A'])
n_group_b = len(df[df['group'] == 'B'])
total_a = 200
total_b = 350
warning_text = f"⚠️ DATA LIMITATION\n"
warning_text += f"Data availability:\n"
warning_text += f" Group A: {n_group_a}/{total_a} ({n_group_a/total_a*100:.1f}%)\n"
warning_text += f" Group B: {n_group_b}/{total_b} ({n_group_b/total_b*100:.1f}%)\n"
warning_text += f"Severe imbalance limits\nstatistical comparability"
ax.text(0.98, 0.02, warning_text, transform=ax.transAxes,
fontsize=11, verticalalignment='bottom', horizontalalignment='right',
bbox=dict(boxstyle='round', facecolor='red', alpha=0.2,
edgecolor='red', linewidth=2),
family='monospace', color='darkred', fontweight='bold')
# Update title to indicate limitation
ax.set_title('Your Title\n(SUPPLEMENTARY - Limited Data Availability)',
fontsize=14, fontweight='bold')**⚠️ CRITICAL DATA LIMITATION**: This figure suffers from severe data availability bias:
- Group A: 84/200 (42%)
- Group B: 10/350 (3%)
This **8-fold imbalance** severely limits statistical comparability. The 10 Group B
samples are unlikely to be representative of all 350.
**Interpretation**: Comparisons should be interpreted with extreme caution. This
figure is provided for completeness but should be considered **supplementary**.# Random subset to balance groups
if n_group_a > n_group_b * 2:
group_a_subset = df[df['group'] == 'A'].sample(n=n_group_b * 2, random_state=42)
# Use subset for balanced comparison# Analysis Name
## Table of Contents
1. [Data Loading](#data-loading)
2. [Data Quality Metrics](#data-quality-metrics)
3. [Figure 1: Completeness](#figure-1-completeness)
4. [Figure 2: Contiguity](#figure-2-contiguity)
5. [Figure 3: Scaffold Validation](#figure-3-scaffold-validation)
...
10. [Methods](#methods)
11. [References](#references)
---## Data Loading
[Your code/analysis]
---
## Data Quality Metrics
[Your code/analysis]from IPython.display import Markdown
sections = ['Introduction', 'Data Loading', 'Analysis', ...]
toc = "## Table of Contents\n\n"
for i, section in enumerate(sections, 1):
anchor = section.lower().replace(' ', '-')
toc += f"{i}. [{section}](#{anchor})\n"
display(Markdown(toc))## Methods
### Karyotype Data
Karyotype data (diploid 2n and haploid n chromosome numbers) was manually curated from peer-reviewed literature for 97 species representing 17.8% of the VGP Phase 1 dataset (n = 545 assemblies).
#### Sex Chromosome Adjustment
When both sex chromosomes are present in the main haplotype, the expected number of chromosome-level scaffolds is:
**expected_scaffolds = n + 1**
For example:
- Asian elephant: 2n=56, n=28, has X+Y → expected 29 scaffolds
- White-throated sparrow: 2n=82, n=41, has Z+W → expected 42 scaffolds
This adjustment accounts for the biological reality that X and Y (or Z and W) are distinct chromosomes.# Display figure
from IPython.display import Image, display
from pathlib import Path
FIG_DIR = Path('figures/analysis_name')
display(Image(filename=str(FIG_DIR / 'figure_01.png')))# Run in background with token authentication
/path/to/conda/envs/ENV_NAME/bin/jupyter lab --no-browser --port=8888--no-browser--port=8888run_in_background=truehttp://localhost:8888/lab?token=TOKEN_STRING/path/to/conda/envs/ENV_NAME/bin/pip install jupyterlabjqcat notebook.ipynb | jq '.cells[10:20]'cat notebook.ipynb | jq '.cells | length'cat notebook.ipynb | jq '.cells[75:81] | .[].source[:2]'# Cell 6: Load genome metadata
# Cell 6: Load genome metadata
import csv

genome_data = []
with open('genome_metadata.tsv') as f:
    reader = csv.DictReader(f, delimiter='\t')
    genome_data = list(reader)

# Group metadata rows by species id so enrichment below is an O(1) lookup.
genome_lookup = {}
for row in genome_data:
    genome_lookup.setdefault(row['species_id'], []).append(row)

# Cell 7: Enrich workflow data with genome characteristics
# NOTE(review): when a species has several metadata rows only the first is
# used — confirm that is the intended precedence.
for inv in data:
    species_id = inv.get('species_id')
    if species_id and species_id in genome_lookup:
        genome_info = genome_lookup[species_id][0]
        # Add genome characteristics (keys must match the TSV header exactly)
        inv['genome_size'] = genome_info.get('Genome size', '')
        inv['heterozygosity'] = genome_info.get('Heterozygosity', '')
        # ... other characteristics
    else:
        # Set to None for missing data
        inv['genome_size'] = None
        inv['heterozygosity'] = None

# Create filtered dataset: only records with a species id AND a genome size
data_with_species = [inv for inv in data if inv.get('species_id') and inv.get('genome_size')]
# Daily backup (start of each work session)
./backup_table.sh
# Milestone backup (after major changes)
./backup_table.sh milestone "added genomescope data for 21 species"
# List all backups
./backup_table.sh list
# Restore from backup (with safety backup)
./backup_table.sh restore 2026-01-23backups/
├── daily/ # Rolling 7-day backups (~770KB each)
│ ├── backup_2026-01-17.csv
│ └── backup_2026-01-23.csv
├── milestones/ # Permanent compressed backups (~200KB each)
│ ├── milestone_2026-01-20_initial_enrichment.csv.gz
│ └── milestone_2026-01-23_recovered_accessions.csv.gz
├── CHANGELOG.md # Auto-generated change log
└── README.md # User documentation# Install backup system in any repository
install-backup-system -f your_data_file.csv# Check how many entities have both metrics
# Count entities carrying each metric, then their overlap, before correlating.
species_with_metric_a = set(inv.get('species_id') for inv in data
                            if inv.get('metric_a'))
species_with_metric_b = set(inv.get('species_id') for inv in data
                            if inv.get('metric_b'))
overlap = species_with_metric_a.intersection(species_with_metric_b)
print(f"Species with both metrics: {len(overlap)}")
# Warn when the paired sample is too small for a meaningful correlation.
if len(overlap) < 10:
    print("⚠️ Warning: Limited data for correlation analysis")
    print(f" Metric A: {len(species_with_metric_a)} species")
    print(f" Metric B: {len(species_with_metric_b)} species")
    print(f" Overlap: {len(overlap)} species")
# Validation cell - place before error-prone sections
# Diagnose variable-shadowing problems: report the runtime type/shape of
# `data` so a dict-where-list-expected bug is caught before it crashes a cell.
print('=== VARIABLE VALIDATION ===')
print(f'Type of data: {type(data)}')
print(f'Is data a list? {isinstance(data, list)}')
if isinstance(data, list):
    print(f'Length: {len(data)}')
    if len(data) > 0:
        print(f'First item type: {type(data[0])}')
        print(f'First item keys: {list(data[0].keys())[:10]}')
elif isinstance(data, dict):
    # f-prefix removed on placeholder-free strings (output is identical)
    print('⚠️ WARNING: data is a dict, not a list!')
    print(f'Dict keys: {list(data.keys())[:10]}')
    print('This suggests variable shadowing occurred.')
# Read notebook (a .ipynb file is plain JSON)
with open('notebook.ipynb', 'r') as f:
    notebook = json.load(f)

# Create new cell: the minimal fields a code cell needs in notebook JSON
# NOTE(review): this appends '\n' to every source line including the last;
# nbformat tools usually omit the final newline — confirm it round-trips.
new_cell = {
    "cell_type": "code",
    "execution_count": None,
    "metadata": {},
    "outputs": [],
    "source": [line + '\n' for line in code.split('\n')]
}

# Insert at position (splice, don't overwrite)
insert_position = 50
notebook['cells'] = (notebook['cells'][:insert_position] +
                     [new_cell] +
                     notebook['cells'][insert_position:])

# Write back
with open('notebook.ipynb', 'w') as f:
    json.dump(notebook, f, indent=1)
NotebookEditEdit.ipynb
# After adding Mann-Whitney test to figure generation:
NotebookEdit(
notebook_path="/path/to/notebook.ipynb",
cell_id="cell-14", # Found via grep or Read
cell_type="markdown",
new_source="Updated description mentioning Mann-Whitney test..."
)# Locate figure references
grep -n "figure_name.png" notebook.ipynb
# Or use Glob + Grep
grep -n "Figure 4" notebook.ipynb# Before: 2 panels
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
# After: 1 panel
fig, ax = plt.subplots(1, 1, figsize=(10, 6))# Before
plt.savefig('06_scaffold_l50_l90_comparison.png')
# After
plt.savefig('06_scaffold_l50_comparison.png')display(Image(...'06_scaffold_l50_comparison.png'))rm figures/*_l50_l90_*.png