5.3 Build and Cache KEGG GMT
We retrieve KEGG pathway gene sets for E. coli K-12 MG1655 from the KEGG API using
KEGGREST. Both the GMT and the locus tag to gene symbol mapping table are cached locally
as RDS files so this step only runs once.
Why do we need a mapping step?
The nf-core/rnaseq pipeline uses locus tags (b0001, b0002…) as gene identifiers
when no gene_name field is present in the reference GTF, which is common for
prokaryotic genomes. The KEGG pathway GMT uses gene symbols (aceE, recA…).
We build a locus tag to gene symbol map directly from KEGG and convert before analysis.
Tip: The first time you run this chunk it queries the KEGG API (~2-3 min).
After that both RDS files load instantly. Do not delete the files in
data/databases/.
gmt_path <- file.path(git_root, "data", "databases",
"KEGG_Pathways_Ecoli_MG1655_GeneSymbol.rds")
map_path <- file.path(git_root, "data", "databases",
"KEGG_locus_to_symbol_Ecoli_MG1655.rds")
if (file.exists(gmt_path) && file.exists(map_path)) {
kegg_gmt <- readRDS(gmt_path)
id_map <- readRDS(map_path)
cat("Loaded cached GMT :", nrow(kegg_gmt), "pathways\n")
cat("Loaded cached ID map :", nrow(id_map), "entries\n")
} else {
cat("Fetching KEGG data for E. coli MG1655 — ~2-3 min...\n")
ecoli_pathways <- keggList("pathway", "eco")
pathway_ids <- names(ecoli_pathways)
pathway_names <- gsub(" - Escherichia coli K-12 MG1655$", "",
as.character(ecoli_pathways))
cat("Total pathways found:", length(pathway_ids), "\n")
batches <- split(pathway_ids, ceiling(seq_along(pathway_ids) / 10))
pathway_genes <- list()
locus_to_symbol <- list()
for (i in seq_along(batches)) {
cat(" Batch", i, "of", length(batches), "\n")
for (pid in batches[[i]]) {
tryCatch({
result <- keggGet(pid)[[1]]
if (!is.null(result$GENE) && length(result$GENE) >= 2) {
locus <- result$GENE[seq(1, length(result$GENE), by = 2)]
symbl <- trimws(sub(";.*", "", result$GENE[seq(2, length(result$GENE), by = 2)]))
pathway_genes[[pid]] <- symbl
for (j in seq_along(locus)) locus_to_symbol[[locus[j]]] <- symbl[j]
}
}, error = function(e) cat(" Skipped:", pid, "\n"))
}
Sys.sleep(0.3)
}
kegg_gmt <- data.frame(
ontology_id = names(pathway_genes),
ontology_name = pathway_names[match(names(pathway_genes), pathway_ids)],
list_of_values = I(pathway_genes),
stringsAsFactors = FALSE
)
id_map <- data.frame(
locus_tag = names(locus_to_symbol),
gene_symbol = unlist(locus_to_symbol),
stringsAsFactors = FALSE
)
dir.create(dirname(gmt_path), recursive = TRUE, showWarnings = FALSE)
saveRDS(kegg_gmt, gmt_path)
saveRDS(id_map, map_path)
cat("GMT saved :", nrow(kegg_gmt), "pathways\n")
cat("ID map saved :", nrow(id_map), "entries\n")
}## Loaded cached GMT : 121 pathways
## Loaded cached ID map : 1765 entries
##
## Example pathways:
## [1] "Glycolysis / Gluconeogenesis"
## [2] "Citrate cycle (TCA cycle)"
## [3] "Pentose phosphate pathway"
## [4] "Pentose and glucuronate interconversions"
## [5] "Fructose and mannose metabolism"
## [6] "Galactose metabolism"