3.4 Preparing the Data

Before building the DESeq2 object we need to:

  1. Set factor levels so that control is the reference level (the denominator in fold-change calculations).
  2. Clean column names in the count matrix: trim surrounding whitespace, replace internal whitespace with underscores, and remove any remaining special characters.
  3. Align the sample order by reordering the metadata rows to match the columns of the count matrix — DESeq2 requires the two to list the same samples in exactly the same order.
# Experimental conditions, listed with "control" first so that it becomes the
# reference level (the denominator of the fold changes).
condition_levels <- c("control", "treatment")

# factor() silently converts every label that is not listed in `levels` into
# NA, so a typo or a case mismatch in the metadata would otherwise go
# unnoticed here. Fail early with the offending labels instead.
# Values that are already NA are left alone, exactly as before.
unknown_groups <- setdiff(
  unique(as.character(samples_info$group[!is.na(samples_info$group)])),
  condition_levels
)
if (length(unknown_groups) > 0) {
  stop(
    "Unexpected value(s) in samples_info$group: ",
    paste(unknown_groups, collapse = ", "),
    ". Expected one of: ", paste(condition_levels, collapse = ", "), ".",
    call. = FALSE
  )
}

samples_info$group <- factor(samples_info$group, levels = condition_levels)

# Sanitize the column names of the count matrix:
#   1. trim leading/trailing whitespace,
#   2. replace each run of internal whitespace with a single underscore,
#   3. drop every character that is not alphanumeric or an underscore.
raw_sample_names   <- colnames(count_genes)
clean_sample_names <- gsub(
  "[^[:alnum:]_]", "",
  gsub("\\s+", "_", trimws(raw_sample_names))
)

# Removing characters can make distinct names collide (e.g. "C-1" and "C1")
# or leave a name empty. Either case makes any later lookup by column name
# ambiguous, so stop and report the original names that cause the problem.
is_ambiguous <- duplicated(clean_sample_names) |
  duplicated(clean_sample_names, fromLast = TRUE) |
  !nzchar(clean_sample_names)
if (any(is_ambiguous)) {
  stop(
    "Cleaning the count-matrix column names produced duplicated or empty ",
    "names. Offending original column names: ",
    paste0("'", raw_sample_names[is_ambiguous], "'", collapse = ", "),
    call. = FALSE
  )
}

colnames(count_genes) <- clean_sample_names

# Every column of the count matrix needs a matching metadata row. Report
# exactly which sample names are missing so they can be fixed quickly.
missing_in_metadata <- setdiff(colnames(count_genes), samples_info$sample)
if (length(missing_in_metadata) > 0) {
  stop(
    "⛔ Some samples in count matrix are NOT in metadata! Check sample names.",
    "\nMissing from metadata: ",
    paste(missing_in_metadata, collapse = ", "),
    call. = FALSE
  )
}

# match() returns only the FIRST hit, so a sample ID that occurs more than
# once in the metadata would be resolved silently and possibly wrongly.
used_ids       <- samples_info$sample[
  samples_info$sample %in% colnames(count_genes)
]
duplicated_ids <- unique(used_ids[duplicated(used_ids)])
if (length(duplicated_ids) > 0) {
  stop(
    "Sample ID(s) appear more than once in the metadata: ",
    paste(duplicated_ids, collapse = ", "),
    call. = FALSE
  )
}

# Reorder the metadata rows to follow the column order of the count matrix;
# metadata rows without a corresponding count column are dropped.
samples_info <- samples_info[match(colnames(count_genes), samples_info$sample), ]

# Visual sanity check: show count-matrix column names next to the metadata
# sample IDs, row by row, so any mismatch in order is easy to spot.
print(
  data.frame(
    count_col    = colnames(count_genes),
    metadata_row = samples_info$sample
  )
)
##   count_col metadata_row
## 1        C1           C1
## 2        C2           C2
## 3        C3           C3
## 4      sac1         sac1
## 5      sac2         sac2
## 6      sac3         sac3