Skip to content

Commit

Permalink
add segSummarize
Browse files Browse the repository at this point in the history
  • Loading branch information
PoisonAlien committed Sep 11, 2024
1 parent 8f6e4f7 commit de9e59f
Show file tree
Hide file tree
Showing 8 changed files with 251 additions and 3 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ Imports:
RColorBrewer,
Rhtslib,
survival,
DNAcopy
DNAcopy,
pheatmap
Suggests:
berryFunctions,
Biostrings,
Expand All @@ -43,7 +44,6 @@ Suggests:
RaggedExperiment,
rmarkdown,
S4Vectors,
pheatmap,
curl
LinkingTo:
Rhtslib,
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ export(rainfallPlot)
export(read.maf)
export(readGistic)
export(sampleSwaps)
export(segSummarize)
export(segmentLogR)
export(setdiffMAF)
export(signatureEnrichment)
Expand Down
177 changes: 177 additions & 0 deletions R/segSummarize.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
#' Summarize CBS segmentation results
#' @details
#' A handy function to summarize CBS segmentation results. Takes segmentation results generated by DNAcopy package \code{\link{segment}} and summarizes the CN for each cytoband and chromosomal arms.
#'
#' @param seg segmentation results generated from \code{DNAcopy} package \code{\link{segment}}. Input should be a multi-sample segmentation file or a data.frame. First six columns should correspond to sample name, chromosome, start, end, Num_Probes, Segment_Mean in log2 scale. (default output format from DNAcopy)
#' @param build genome build. Default hg19. Can be hg19, hg38. If other than these, use `cytoband` argument
#' @param cytoband cytoband data from UCSC genome browser. Only needed if `build` is other than `hg19` or `hg38`
#' @param thr threshold to call amplification and deletion. Any cytobands or chromosomal arms with median logR above or below this will be called. Default 0.3
#' @param verbose Default TRUE
#' @param maf optional MAF
#' @param genes Add mutation status of these genes as an annotation to the heatmap
#' @param topanno annotation for each sample. This is passed as an input to `annotation_col` of `pheatmap`
#' @param topannocols annotation cols for `topanno`. This is passed as an input to `annotation_colors` of `pheatmap`
#' @return List of median CN values for each cytoband and chromosomal arm along with the plotting matrix
#' @export
#' @examples
#' laml.seg <- system.file("extdata", "LAML_CBS_segments.tsv.gz", package = "maftools")
#' segSummarize(seg = laml.seg)
#'
#' #Heighlight some genes as annotation
#' laml.maf = system.file("extdata", "tcga_laml.maf.gz", package = "maftools") #MAF file
#' laml.clin = system.file('extdata', 'tcga_laml_annot.tsv', package = 'maftools') #clinical data
#' laml = read.maf(maf = laml.maf, clinicalData = laml.clin)
#'
#' segSummarize(seg = laml.seg, maf = laml, genes = c("FLT3", "DNMT3A"))

segSummarize = function(seg = NULL, build = "hg19", cytoband = NULL, thr = 0.3, verbose = TRUE, maf = NULL, genes = NULL, topanno = NULL, topannocols = NA){

if(is.null(cytoband)){
build = match.arg(arg = build, choices = c("hg19", "hg38"))
cytoband <- system.file("extdata", paste0(build, "_cytobands.tsv.gz"), package = "maftools")
if(!file.exists(cytoband)){
stop("Cytoband file does not exist!")
}
}

if (is(object = seg, class2 = "data.frame")) {
seg = data.table::as.data.table(seg)
}else {
seg = data.table::fread(input = seg)
}
colnames(seg)[1:6] = c("Sample", "Chromosome", "Start", "End", "Num_Probes", "Segment_Mean")
seg$Chromosome = gsub(pattern = "^chr", replacement = "", x = seg$Chromosome)
data.table::setkey(x = seg, Chromosome, Start, End)

if(nrow(seg[,.N,Sample]) < 3){
stop("Not enough samples to proceed (N: ", nrow(seg[,.N,Sample]), ")")
}

cy = data.table::fread(input = cytoband)
colnames(cy) = c("Chromosome", "Start", "End", "name", "stain")
cy$arm = substr(cy$name, 1, 1)
cy$Chromosome = gsub(pattern = "^chr", replacement = "", x = cy$Chromosome)
data.table::setkey(x = cy, Chromosome, Start, End)

seg_events = lapply(split(seg, seg$Sample), function(x){
.seg2arm(se = x, cy = cy, thr = thr)
})

cy[, id := paste(Chromosome, name, sep = "_")]
cy[, arm := paste(Chromosome, substr(name, 1, 1), sep = "_")]

#Summarize CN per chromosomal cytoband
cytoband_events = lapply(seg_events, function(x) x$seg) |> data.table::rbindlist(idcol = "Tumor_Sample_Barcode")
cytoband_events = data.table::dcast(data = cytoband_events, formula = chromosome+arm ~ Tumor_Sample_Barcode, value.var = "cn")

cytoband_events[, id := paste(chromosome, arm, sep = "_")]
cytoband_events$chromosome = NULL
cytoband_events$arm = NULL
data.table::setDF(x = cytoband_events, cytoband_events$id)
cytoband_events$id = NULL
cytoband_events = cytoband_events[cy[id %in% rownames(cytoband_events), id],,]
cytoband_events[cytoband_events > 4] = 4
cytoband_events = cytoband_events[complete.cases(cytoband_events),]

##row anno
cn_band_anno = as.data.frame(cy[id %in% rownames(cytoband_events)])
cn_band_anno = data.frame(row.names = cn_band_anno$id, chr = cn_band_anno$Chromosome, arm = substr(x = cn_band_anno$name, 1, 1))

#order by chromosome for human build
chr_ord = intersect(c(1:22, "X", "Y"), names(split(cn_band_anno, cn_band_anno$chr)))
cn_band_anno = cn_band_anno[unlist(lapply(split(cn_band_anno, cn_band_anno$chr)[chr_ord], rownames), use.names = FALSE), , drop = FALSE]
cytoband_events = cytoband_events[rownames(cn_band_anno),, drop = FALSE]

##Heatmap colors
cols_chromosome = setNames(object = rep(c("#95a5a6", "#7f8c8d"), length(unique(cn_band_anno$chr))), nm = unique(cn_band_anno$chr))
cols_chromosome = cols_chromosome[unique(cn_band_anno$chr)]
cols_chromosome_arms = setNames(object = c("#ecf0f1", "#bdc3c7"), nm = c("p", "q"))

gene2barcode = gene2barcode_cols = NA
show_annotation_legend = FALSE

if(!is.null(maf)){
if(!is.null(genes)){
gene2barcode = genesToBarcodes(maf = maf, genes = genes, justNames = TRUE)
print(gene2barcode)
gene2barcode = lapply(gene2barcode, function(x) data.table::data.table(Tumor_Sample_Barcode = x, status = 'yes')) |> data.table::rbindlist(idcol = "Hugo_Symbol") |> data.table::dcast(formula = Tumor_Sample_Barcode ~ Hugo_Symbol, value.var = "status")
data.table::setDF(x = gene2barcode, rownames = gene2barcode$Tumor_Sample_Barcode)
gene2barcode = gene2barcode[, 2:ncol(gene2barcode), drop = FALSE]
gene2barcode_cols = lapply(X = colnames(gene2barcode), function(x) setNames(object = "#34495e", nm = "yes"))
names(gene2barcode_cols) = colnames(gene2barcode)
}
}

if(!is.null(topanno)){
if(!is.null(nrow(gene2barcode))){
gene2barcode = merge(gene2barcode, topanno, by = "row.names", all = TRUE)
rownames(gene2barcode) = gene2barcode$`Row.names`
gene2barcode$`Row.names` = NULL
}else{
gene2barcode = topanno
}
show_annotation_legend = TRUE
}

pheatmap::pheatmap(
cytoband_events,
cluster_rows = FALSE,
cluster_cols = TRUE,
show_colnames = FALSE,
annotation_row = cn_band_anno,
color = grDevices::colorRampPalette(rev(brewer.pal(
n = 7, name = "PiYG"
)))(100),
annotation_colors = c(list(
chr = cols_chromosome,
arm = cols_chromosome_arms
), gene2barcode_cols, topannocols),
show_rownames = FALSE,
border_color = 'black', annotation_legend = show_annotation_legend, legend_breaks = seq(0, 4, 1), legend_labels = c("0", "1", "2", "3", ">4"), annotation_col = gene2barcode, na_col = "gray"
)

#Arm events
arm_events = lapply(seg_events, function(x) x$arm) |> data.table::rbindlist(fill = TRUE, use.names = TRUE, idcol = "Tumor_Sample_Barcode")
arm_events = arm_events[!is.na(Variant_Classification)][order(chromosome)]
arm_events_n = arm_events[,.N,.(arm,Variant_Classification)][!is.na(Variant_Classification)][order(-N)]

if(verbose){
message("Recurrent chromosomal arm aberrations")
print(arm_events_n)
}

#Focal band events
band_events = lapply(seg_events, function(x) x$seg) |> data.table::rbindlist(fill = TRUE, use.names = TRUE, idcol = "Tumor_Sample_Barcode")
band_events = band_events[!is.na(Variant_Classification)][order(chromosome)]
colnames(band_events)[3] = "cytoband"
band_events_n = band_events[,.N,.(chromosome, cytoband,Variant_Classification)][!is.na(Variant_Classification)][order(-N)]

attr(cytoband_events, "meta") = list(top_anno = gene2barcode, top_anno_cols = gene2barcode_cols)

list(heatmap_matrix = cytoband_events, arm_events = arm_events, band_events = band_events)
}

.seg2arm = function (se, cy = NULL, thr = 0.3) {

data.table::setkey(x = se, Chromosome, Start, End)
se_olaps = data.table::foverlaps(x = cy, y = se)
cytoband_mean = se_olaps[, median(Segment_Mean, na.rm = TRUE),
.(Chromosome, name)]
colnames(cytoband_mean) = c("chromosome", "arm", "logR")
cytoband_mean[, `:=`(cn, 2 * (2^logR))]
cytoband_mean$Variant_Classification = ifelse(cytoband_mean$logR >
thr, "Amp", no = ifelse(cytoband_mean$logR < -thr, yes = "Del",
no = NA))
cytoband_mean = cytoband_mean[!chromosome %in% c("chrX",
"chrY")]
arm_mean = se_olaps[, median(Segment_Mean, na.rm = TRUE),
.(Chromosome, arm)]
colnames(arm_mean) = c("chromosome", "arm", "logR")
arm_mean[, `:=`(cn, 2 * (2^logR))]
arm_mean$Variant_Classification = ifelse(arm_mean$logR >
thr, "Gain", no = ifelse(arm_mean$logR < -thr, yes = "Loss",
no = NA))
arm_mean$arm = paste(arm_mean$chromosome, arm_mean$arm, sep = "_")
arm_mean = arm_mean[!chromosome %in% c("chrX", "chrY")]
list(arm = arm_mean, seg = cytoband_mean)
}
Binary file added inst/extdata/LAML_CBS_segments.tsv.gz
Binary file not shown.
Binary file added inst/extdata/hg19_cytobands.tsv.gz
Binary file not shown.
Binary file added inst/extdata/hg38_cytobands.tsv.gz
Binary file not shown.
57 changes: 57 additions & 0 deletions man/segSummarize.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 14 additions & 1 deletion vignettes/maftools.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ If you find this tool useful, please cite:

------------------------------------------------------------------------

***Mayakonda A, Lin DC, Assenov Y, Plass C, Koeffler HP. 2018. Maftools: efficient and comprehensive analysis of somatic variants in cancer. [Genome Resarch. PMID: 30341162](https://doi.org/10.1101/gr.239244.118)***
***Mayakonda A, Lin DC, Assenov Y, Plass C, Koeffler HP. 2018. Maftools: efficient and comprehensive analysis of somatic variants in cancer. [Genome Research. PMID: 30341162](https://doi.org/10.1101/gr.239244.118)***

------------------------------------------------------------------------

Expand Down Expand Up @@ -279,6 +279,19 @@ This is similar to oncoplots except for copy number data. One can again sort the
gisticOncoPlot(gistic = laml.gistic, clinicalData = getClinicalData(x = laml), clinicalFeatures = 'FAB_classification', sortByAnnotation = TRUE, top = 10)
```

## CBS segments

### Summarizing chromosomal arm level CN

A multi-sample CBS segments file can be summarized to get the CN status of each cytoband and chromosomal arms. Below plot shows 5q deletions in a subset of AML samples.
The function returns the plot data and the CN status of each chromosomal cytoband and arms for every sample.

```{r segSummarize}
laml.seg <- system.file("extdata", "LAML_CBS_segments.tsv.gz", package = "maftools")
segSummarize_results = segSummarize(seg = laml.seg)
```


### Visualizing CBS segments

```{r plotCBSsegments, fig.height=2.5,fig.width=4,fig.align='left'}
Expand Down

0 comments on commit de9e59f

Please sign in to comment.