# R Data Science
R programming for data analysis, visualization, and statistical workflows. Use when working with R scripts (.R), Quarto documents (.qmd), RMarkdown (.Rmd), or R projects. Covers tidyverse workflows, ggplot2 visualizations, statistical analysis, epidemiological methods, and reproducible research practices.
npx skill4agent add crypticpy/rdata r-data-science

library(tidyverse)
# Data import: read raw files, then clean names before any analysis.

# CSV (most common)
df <- read_csv("data/raw/dataset.csv")

# Excel
df <- readxl::read_excel("data/raw/dataset.xlsx", sheet = "Sheet1")

# Clean column names immediately (snake_case, no spaces/punctuation)
df <- df |> janitor::clean_names()

# Typical wrangling pipeline: filter -> transform -> summarize.
analysis_data <- raw_data |>
  # Clean and filter
  filter(!is.na(key_variable)) |>
  # Transform variables
  mutate(
    date = as.Date(date_string, format = "%Y-%m-%d"),
    age_group = cut(age, breaks = c(0, 18, 45, 65, Inf),
                    labels = c("0-17", "18-44", "45-64", "65+"))
  ) |>
  # Summarize
  group_by(region, age_group) |>
  summarize(
    n = n(),
    mean_value = mean(outcome, na.rm = TRUE),
    .groups = "drop"  # drop grouping so downstream verbs see an ungrouped frame
  )

ggplot(df, aes(x = date, y = count, color = category)) +
geom_line(linewidth = 1) +
scale_color_brewer(palette = "Set2") +
labs(
title = "Trend Over Time",
subtitle = "By category",
x = "Date",
y = "Count",
color = "Category",
caption = "Source: Dataset Name"
) +
theme_minimal(base_size = 12) +
theme(
legend.position = "bottom",
plot.title = element_text(face = "bold")
)case_countscalculate_rate()filter_outliers()compute_summary()patient_datasurveillance_df<-=+|>:::$|># Good
result <- data |>
  filter(year >= 2020) |>
  group_by(county) |>
  summarize(total = sum(cases))

# Bad
result<-data|>filter(year>=2020)|>group_by(county)|>summarize(total=sum(cases))

# Use <- for assignment (never = or ->); space around operators such as >=.

# Load and clean surveillance data ------------------------------------------
# Calculate age-adjusted rates
# Using direct standardization method per CDC guidelines
adjusted_rate <- calculate_adjusted_rate(df, standard_pop)

library(tidyverse)  # Loads: ggplot2, dplyr, tidyr, readr, purrr, tibble, stringr, forcats

| Task | Package | Key Functions |
|---|---|---|
| CSV/TSV | readr | `read_csv()`, `read_tsv()`, `write_csv()` |
| Excel | readxl, writexl | `read_excel()`, `write_xlsx()` |
| SAS/SPSS/Stata | haven | `read_sas()`, `read_spss()`, `read_dta()` |
| JSON | jsonlite | `fromJSON()`, `toJSON()` |
| Databases | DBI, dbplyr | `dbConnect()`, `tbl()`, `collect()` |
| Task | Package | Key Functions |
|---|---|---|
| Column cleaning | janitor | `clean_names()`, `remove_empty()` |
| Date handling | lubridate | `ymd()`, `floor_date()`, `year()` |
| String operations | stringr | `str_detect()`, `str_replace()`, `str_extract()` |
| Missing data | naniar | `miss_var_summary()`, `vis_miss()` |
| Task | Package | Key Functions |
|---|---|---|
| Core plotting | ggplot2 | `ggplot()`, `aes()`, `geom_line()` |
| Extensions | ggrepel, patchwork | `geom_text_repel()`, `wrap_plots()` |
| Interactive | plotly | `ggplotly()` |
| Tables | gt, kableExtra | `gt()`, `kbl()` |
| Task | Package | Key Functions |
|---|---|---|
| Model summaries | broom | `tidy()`, `glance()`, `augment()` |
| Regression | stats, lme4 | `lm()`, `glm()`, `lmer()` |
| Survival | survival | `Surv()`, `coxph()`, `survfit()` |
| Survey data | survey | `svydesign()`, `svymean()` |
| Task | Package | Key Functions |
|---|---|---|
| Epi calculations | epiR | `epi.2by2()`, `epi.conf()` |
| Outbreak tools | incidence2, epicontacts | `incidence()`, `make_epicontacts()` |
| Disease mapping | SpatialEpi | `kulldorff()` |
| Surveillance | surveillance | `sts()`, `farringtonFlexible()` |
| Rate calculations | epitools | `ageadjust.direct()`, `oddsratio()`, `riskratio()` |
project/
├── project.Rproj
├── renv.lock
├── CLAUDE.md          # Claude Code configuration
├── README.md
├── data/
│   ├── raw/           # Never modify
│   └── processed/     # Analysis-ready
├── R/                 # Custom functions
├── scripts/           # Pipeline scripts
├── analysis/          # Quarto documents
└── output/
    ├── figures/
    └── tables/

---
title: "Analysis Title"
author: "Your Name"
date: today
format:
  html:
    toc: true
    code-fold: true
    embed-resources: true
execute:
  warning: false
  message: false
---

# Initialize (once per project)
renv::init()

# Snapshot dependencies after installing packages
renv::snapshot()

# Restore environment (for collaborators)
renv::restore()

# ============================================================================
# Title: Analysis of [Subject]
# Author: [Name]
# Date: [Date]
# Purpose: [One-sentence description]
# Input: data/processed/clean_data.csv
# Output: output/figures/trend_plot.png
# ============================================================================

df |>
group_by(category) |>
summarize(
n = n(),
mean = mean(value, na.rm = TRUE),
sd = sd(value, na.rm = TRUE),
median = median(value, na.rm = TRUE),
q25 = quantile(value, 0.25, na.rm = TRUE),
q75 = quantile(value, 0.75, na.rm = TRUE)
) |>
gt::gt() |>
gt::fmt_number(columns = where(is.numeric), decimals = 2)model <- glm(outcome ~ exposure + age + sex, data = df, family = binomial)
# Tidy coefficients (exponentiate = TRUE gives odds ratios for logistic models)
tidy_results <- broom::tidy(model, conf.int = TRUE, exponentiate = TRUE) |>
  select(term, estimate, conf.low, conf.high, p.value)

# Model diagnostics (one-row summary: AIC, deviance, etc.)
glance_results <- broom::glance(model)

library(incidence2)
# Create incidence object: weekly case counts by onset date, split by group
inc <- incidence(
  df,
  date_index = "onset_date",
  interval = "week",
  groups = "outcome_category"
)

# Plot (incidence2's plot() returns a ggplot, so + layers work)
plot(inc) +
  labs(
    title = "Epidemic Curve",
    x = "Week of Onset",
    y = "Number of Cases"
  ) +
  theme_minimal()

# Age-adjusted rates using direct standardization
library(epitools)

# Stratum-specific counts and populations; strata must align across all three vectors
result <- ageadjust.direct(
  count = df$cases,
  pop = df$population,
  stdpop = standard_population$pop  # e.g., US 2000 standard
)

# Validate data before analysis
# Hard requirements: stop immediately with a named message if any check fails.
stopifnot(
  "Data frame is empty" = nrow(df) > 0,
  "Missing required columns" = all(c("id", "date", "value") %in% names(df)),
  "Duplicate IDs found" = !any(duplicated(df$id))
)

# Informative warnings for data quality issues (non-fatal)
if (sum(is.na(df$key_var)) > 0) {
  warning(sprintf(
    "%d missing values in key_var (%.1f%%)",
    sum(is.na(df$key_var)),
    100 * mean(is.na(df$key_var))
  ), call. = FALSE)  # call. = FALSE: cleaner message without the call stack
}

# Check file exists before reading
if (!file.exists(filepath)) {
  stop(sprintf("File not found: %s", filepath), call. = FALSE)
}

# Create directories if needed (recursive covers nested paths; no warning if it exists)
dir.create("output/figures", recursive = TRUE, showWarnings = FALSE)

# Use data.table for >1M rows
library(data.table)
dt <- fread("large_file.csv")

# Or use arrow for very large/parquet files
library(arrow)
df <- read_parquet("data.parquet")

# Lazy evaluation with duckdb (queries run in the database, not in R memory)
library(duckdb)
con <- dbConnect(duckdb())
df_lazy <- tbl(con, "data.csv")

# Good: vectorized
# Rate per 100,000, computed on whole columns at once — fast and concise.
df$rate <- df$cases / df$population * 100000
# Avoid: row-by-row loop
# (Same result, but orders of magnitude slower; also note 1:nrow(df)
# misbehaves when nrow(df) == 0 — prefer seq_len(nrow(df)) when looping.)
for (i in 1:nrow(df)) {
df$rate[i] <- df$cases[i] / df$population[i] * 100000
}