library(gwasplot)
#> ℹ Setting duckdb_max_memory to 7GB, using 80% of available system memory.
#> ℹ Change this with options(duckdb_max_memory = 'XGB')
#>
#> Attaching package: 'gwasplot'
#> The following object is masked from 'package:stats':
#>
#> qqplot
# Approximate chromosome sizes (Mb, hg38)
chr_mb <- c(249, 242, 198, 190, 181, 171, 159, 145, 138, 134,
135, 133, 115, 107, 102, 90, 83, 80, 59, 63, 47, 51)
sim_gwas <- function(n_total, hits = NULL,
chroms = paste0("chr", 1:22)) {
sizes <- chr_mb[seq_along(chroms)]
dfs <- Map(function(ch, mb) {
n <- max(1L, round(n_total * mb / sum(sizes)))
data.frame(
CHROM = ch,
POS = sort(sample.int(mb * 1e6L, n)),
PVALUE = runif(n)
)
}, chroms, sizes)
df <- do.call(rbind, dfs)
for (h in hits) {
idx <- df$CHROM == h$chrom & abs(df$POS - h$pos) < h$window
df$PVALUE[idx] <- runif(sum(idx), h$pmin, h$pmax)
}
df
}
set.seed(123)
gwas_df <- sim_gwas(
n_total = 100000,
chroms = paste0("chr", 1:5),
hits = list(
list(chrom = "chr1", pos = 100e6, window = 2e6, pmin = 1e-20, pmax = 1e-8),
list(chrom = "chr3", pos = 60e6, window = 2e6, pmin = 1e-15, pmax = 1e-8),
list(chrom = "chr4", pos = 80e6, window = 2e6, pmin = 1e-12, pmax = 1e-8)
)
)
nrow(gwas_df)
#> [1] 100000After association testing it is common to identify a small set of independent signals from a larger pool of genome-wide significant variants. identify_loci() uses a greedy, distance-based algorithm: it picks the most significant variant as a locus lead, assigns all variants within a specified window to that locus, then repeats on the remaining unassigned variants.
Simulate data with signal clusters
The simulation places three tight clusters of significant variants on chromosomes 1, 3, and 4.
Filter to genome-wide significant hits
select_top_hits() filters rows to those below a p-value threshold (default 5×10⁻⁸).
top_hits <- select_top_hits(gwas_df, threshold = 5e-8)
nrow(top_hits)
#> [1] 1173Identify independent loci
identify_loci() adds two columns to the filtered data:
-
locus_id— integer locus index (1, 2, 3, …) -
is_lead—TRUEfor the most significant variant in each locus
loci <- identify_loci(top_hits, window_kb = 500)
head(loci)
#> CHROM POS PVALUE locus_id is_lead
#> chr1.9582 chr1 101116924 2.932216e-12 1 TRUE
#> chr1.9438 chr1 99423996 1.061566e-11 2 TRUE
#> chr4.7532 chr4 79413859 3.030097e-11 3 TRUE
#> chr3.5761 chr3 60575929 6.373109e-11 4 TRUE
#> chr1.9488 chr1 100059230 7.147869e-11 5 TRUE
#> chr3.5755 chr3 60493613 7.346702e-11 4 FALSELead variant table
Filtering to lead variants gives one row per independent signal.
leads <- loci |>
dplyr::filter(is_lead) |>
dplyr::select(locus_id, CHROM, POS, PVALUE)
leads
#> locus_id CHROM POS PVALUE
#> chr1.9582 1 chr1 101116924 2.932216e-12
#> chr1.9438 2 chr1 99423996 1.061566e-11
#> chr4.7532 3 chr4 79413859 3.030097e-11
#> chr3.5761 4 chr3 60575929 6.373109e-11
#> chr1.9488 5 chr1 100059230 7.147869e-11
#> chr3.5540 6 chr3 58170371 9.580576e-11
#> chr1.9649 7 chr1 101815045 1.040804e-10
#> chr4.7595 8 chr4 80060358 1.041392e-10
#> chr3.5809 9 chr3 61090783 1.057991e-10
#> chr3.5891 10 chr3 61952497 1.126412e-10
#> chr4.7489 11 chr4 78903586 1.421319e-10
#> chr4.7719 12 chr4 81346759 1.544312e-10
#> chr3.5629 13 chr3 59085865 1.616394e-10
#> chr4.7418 14 chr4 78291694 1.924481e-10
#> chr4.7661 15 chr4 80753029 3.342818e-10
#> chr1.9366 16 chr1 98790304 3.479665e-10
#> chr3.5701 17 chr3 59899745 3.978974e-10
#> chr1.9295 18 chr1 98105444 6.391552e-10
#> chr4.7785 19 chr4 81962052 6.958988e-10
#> chr1.9531 20 chr1 100564739 4.443700e-09The three signals on chromosomes 1, 3, and 4 are recovered as expected.
Adding gene annotations
When working with a DuckDB-backed GWASFormatter object, find_nearest_gene() can annotate each variant with its closest gene, adding gene_name, gene_id, and distance columns. See ?find_nearest_gene for details.