5.3 Build and Cache KEGG GMT

We retrieve KEGG pathway gene sets for E. coli K-12 MG1655 from the KEGG API using KEGGREST. Both the GMT and the locus tag to gene symbol mapping table are cached locally as RDS files so this step only runs once.

Why do we need a mapping step? The nf-core/rnaseq pipeline uses locus tags (b0001, b0002…) as gene identifiers when no gene_name field is present in the reference GTF, which is common for prokaryotic genomes. The KEGG pathway GMT uses gene symbols (aceE, recA…). We build a locus tag to gene symbol map directly from KEGG and convert before analysis.

Tip: The first time you run this chunk it queries the KEGG API (~2-3 min). After that both RDS files load instantly. Do not delete the files in data/databases/.

gmt_path <- file.path(git_root, "data", "databases",
                      "KEGG_Pathways_Ecoli_MG1655_GeneSymbol.rds")
map_path <- file.path(git_root, "data", "databases",
                      "KEGG_locus_to_symbol_Ecoli_MG1655.rds")

if (file.exists(gmt_path) && file.exists(map_path)) {

  kegg_gmt <- readRDS(gmt_path)
  id_map   <- readRDS(map_path)
  cat("Loaded cached GMT    :", nrow(kegg_gmt), "pathways\n")
  cat("Loaded cached ID map :", nrow(id_map), "entries\n")

} else {

  cat("Fetching KEGG data for E. coli MG1655 — ~2-3 min...\n")

  ecoli_pathways <- keggList("pathway", "eco")
  pathway_ids    <- names(ecoli_pathways)
  pathway_names  <- gsub(" - Escherichia coli K-12 MG1655$", "",
                         as.character(ecoli_pathways))

  cat("Total pathways found:", length(pathway_ids), "\n")

  batches         <- split(pathway_ids, ceiling(seq_along(pathway_ids) / 10))
  pathway_genes   <- list()
  locus_to_symbol <- list()

  for (i in seq_along(batches)) {
    cat("  Batch", i, "of", length(batches), "\n")
    for (pid in batches[[i]]) {
      tryCatch({
        result <- keggGet(pid)[[1]]
        if (!is.null(result$GENE) && length(result$GENE) >= 2) {
          locus  <- result$GENE[seq(1, length(result$GENE), by = 2)]
          symbl  <- trimws(sub(";.*", "", result$GENE[seq(2, length(result$GENE), by = 2)]))
          pathway_genes[[pid]] <- symbl
          for (j in seq_along(locus)) locus_to_symbol[[locus[j]]] <- symbl[j]
        }
      }, error = function(e) cat("  Skipped:", pid, "\n"))
    }
    Sys.sleep(0.3)
  }

  kegg_gmt <- data.frame(
    ontology_id    = names(pathway_genes),
    ontology_name  = pathway_names[match(names(pathway_genes), pathway_ids)],
    list_of_values = I(pathway_genes),
    stringsAsFactors = FALSE
  )

  id_map <- data.frame(
    locus_tag   = names(locus_to_symbol),
    gene_symbol = unlist(locus_to_symbol),
    stringsAsFactors = FALSE
  )

  dir.create(dirname(gmt_path), recursive = TRUE, showWarnings = FALSE)
  saveRDS(kegg_gmt, gmt_path)
  saveRDS(id_map,   map_path)
  cat("GMT saved    :", nrow(kegg_gmt), "pathways\n")
  cat("ID map saved :", nrow(id_map), "entries\n")
}
## Loaded cached GMT    : 121 pathways
## Loaded cached ID map : 1765 entries
cat("\nExample pathways:\n")
## 
## Example pathways:
head(kegg_gmt$ontology_name, 6)
## [1] "Glycolysis / Gluconeogenesis"            
## [2] "Citrate cycle (TCA cycle)"               
## [3] "Pentose phosphate pathway"               
## [4] "Pentose and glucuronate interconversions"
## [5] "Fructose and mannose metabolism"         
## [6] "Galactose metabolism"

5.3.1 Convert Locus Tags to Gene Symbols

res_df_sym <- res_df %>%
  left_join(id_map, by = c("gene" = "locus_tag")) %>%
  mutate(gene = ifelse(!is.na(gene_symbol), gene_symbol, gene)) %>%
  select(-gene_symbol)

res_sig_sym <- res_sig %>%
  left_join(id_map, by = c("gene" = "locus_tag")) %>%
  mutate(gene = ifelse(!is.na(gene_symbol), gene_symbol, gene)) %>%
  select(-gene_symbol)

cat("Genes mapped to symbols  :", sum(res_df$gene %in% id_map$locus_tag),
    "of", nrow(res_df), "\n")
## Genes mapped to symbols  : 1576 of 3698
cat("Remaining as locus tags  :", sum(!res_df_sym$gene %in% id_map$gene_symbol),
    "(unannotated in KEGG - expected)\n")
## Remaining as locus tags  : 2122 (unannotated in KEGG - expected)
cat("\nExample converted gene names:\n")
## 
## Example converted gene names:
head(res_sig_sym$gene, 10)
##  [1] "ompF"  "b3508" "b3510" "garR"  "b0553" "b3127" "gadC"  "b3509" "gadB" 
## [10] "b3514"

Note: Genes remaining as locus tags are not annotated in any KEGG pathway. They are included in the ORA background but will not match any gene set. This is expected — not all E. coli genes are assigned to KEGG pathways.