Description: This R Markdown file demonstrates the use of JMnorm to normalize the target signal matrix “TCD8.raw_sigmat.txt” against the reference signal matrix “ref.raw_sigmat.txt”. The input matrices, “TCD8.raw_sigmat.txt” and “ref.raw_sigmat.txt”, should be formatted as N-by-(M+1) matrices, where N represents the number of cCREs, and M represents the number of chromatin features. The first column of each matrix contains the cCRE IDs. The signal values for each chromatin feature in the cCREs are expected to be non-negative and in linear scale.
Load Libraries
start.time <- Sys.time()
library(readr)
library(pheatmap)
library(dynamicTreeCut)
Load JMnorm source code & setup working directory.
!!! Change the file path of “JMnorm.script.R” and working directory path.
# Source the JMnorm script
source('/Users/guanjuexiang/Documents/projects/git/JMnorm/bin/JMnorm_core/JMnorm.script.R')
# Set the working directory
setwd('/Users/guanjuexiang/Documents/projects/git/JMnorm/test_data/')
Set parameters
# get feature list
feature_list = c("ATAC", "H3K27ac", "H3K27me3", "H3K36me3", "H3K4me1", "H3K4me3", "H3K9me3")
# added 1 to avoid 0 in log transformation
add_sn = 1
# input file names
# The testing signal matrices can be found in this link: https://github.com/camp4tx/JMnorm/tree/main/docs
refer_sigmat_file = 'ref.raw_sigmat.txt'
target_sigmat_file = 'TCD8.raw_sigmat.txt'
after_JMnorm_output_target_sigmat_file = 'TCD8.JMnorm_sigmat.txt'
Read reference signal matrix
# read refe_ave_sigmat
file_ref = as.data.frame(read_table(refer_sigmat_file, col_names = T))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## cCREids = col_double(),
## ATAC = col_double(),
## H3K27ac = col_double(),
## H3K27me3 = col_double(),
## H3K36me3 = col_double(),
## H3K4me1 = col_double(),
## H3K4me3 = col_double(),
## H3K9me3 = col_double()
## )
file_ref_sig0 = file_ref[,-1]
Normalize signal to noise ratio (SNR) of reference signal matrix
# normalize signal to noise ratio (SNR) across features (Optional)
file_ref_sig = normalize_snr(file_ref_sig0)
# if user decided to NOT normalize SNR across different features, then use the following command instead
# file_ref_sig = file_ref_sig0
# log2 transformation
file_ref_sig_log2 = log2(file_ref_sig+add_sn)
Determine number of clusters in reference signal matrix
# Determine the K value for JMnorm based on reference signal matrix
K = determineK(file_ref_sig_log2, sample_size=20000)
## ..cutHeight not given, setting it to 8.87 ===> 99% of the (truncated) height range in dendro.
## ..done.
## [1] "cutreeDynamic: "
## ref_km_cluster
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 1980 928 902 785 670 653 642 632 570 559 558 554 544 527 507 503
## 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
## 500 500 488 479 478 448 446 439 404 396 393 376 360 333 319 314
## 33 34 35 36 37 38 39
## 312 299 281 257 244 217 203
Read target signal matrix
# read target signal matrix
target_ct = as.data.frame(read_table(target_sigmat_file, col_names = T))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## cCREids = col_double(),
## ATAC = col_double(),
## H3K27ac = col_double(),
## H3K27me3 = col_double(),
## H3K36me3 = col_double(),
## H3K4me1 = col_double(),
## H3K4me3 = col_double(),
## H3K9me3 = col_double()
## )
# add noise to avoid many 0s
target_ct_sig0 = target_ct[,-1]+matrix(runif(prod(dim(target_ct)), min = -0.00001, max = 0.0001), nrow = dim(target_ct)[1])
# avoid negative values
target_ct_sig0 = target_ct_sig0-min(target_ct_sig0)
Normalize signal to noise ratio (SNR) of target signal matrix
# normalize signal to noise ratio (SNR) across features
target_ct_sig = normalize_snr(target_ct_sig0)
# log2 transformation
target_ct_sig_log2 = log2(target_ct_sig+add_sn)
Use JMnorm to normalize target signal matrix against reference signal matrix
# JMnorm
target_ct_sig_JMnorm = JMnorm(target_ct_sig_log2, file_ref_sig_log2, K)
## [1] 34 32795 37316
## [1] 16 35061 14988
## [1] 13 49384 22165
## [1] 24 7220 7043
## [1] 14 50556 26648
## [1] 23 30484 24946
## [1] 17 20507 18099
## [1] 27 28008 26640
## [1] 5 41008 25930
## [1] 4 22239 68896
## [1] 22 27531 40119
## [1] 15 29566 27136
## [1] 3 24341 45554
## [1] 11 15113 30515
## [1] 12 28482 18798
## [1] 37 17254 24912
## [1] 36 20973 11308
## [1] 9 26209 21586
## [1] 26 4539 9889
## [1] 32 15292 11037
## [1] 33 50138 39076
## [1] 21 15544 52248
## [1] 29 18884 13010
## [1] 38 11384 18603
## [1] 2 16858 18118
## [1] 19 21352 20864
## [1] 1 33686 42270
## [1] 39 30609 30725
## [1] 35 14575 10341
## [1] 31 36318 22579
## [1] 28 27210 31413
## [1] 25 33083 39393
## [1] 6 42261 28134
## [1] 18 36514 61185
## [1] 20 12250 5969
## [1] 8 20030 18513
## [1] 30 7856 7582
## [1] 7 47084 40008
## [1] 10 61680 50322
# add colnames
colnames(file_ref_sig_log2) = feature_list
# correlation between features in reference cell-type
file_ref_sig_log2_cor = cor(file_ref_sig_log2)
# add colnames
colnames(target_ct_sig_JMnorm) = feature_list
# convert JMnorm normalized target signal matrix back to linear scale
target_ct_sig_JMnorm_linear = 2^(target_ct_sig_JMnorm)-add_sn
# write output JMnorm normalized target signal matrix
output_mat = cbind(target_ct[,1], round(target_ct_sig_JMnorm_linear, 3))
colnames(output_mat) = colnames(target_ct)
write.table(output_mat, after_JMnorm_output_target_sigmat_file, quote=F, col.names=T, row.names = F, sep='\t')
Get cross-feature correlation matrix
# correlation between features in target cell-type before JMnorm
target_ct_sig_cor = cor(target_ct_sig_log2)
# correlation between features in target cell-type after JMnorm
target_ct_sig_JMnorm_cor = cor(target_ct_sig_JMnorm)
# calculate the R2 between the correlation matrix of target cell-type and reference cell-type
r2_after = r2(as.numeric(target_ct_sig_JMnorm_cor), as.numeric(file_ref_sig_log2_cor))
r2_before = r2(as.numeric(target_ct_sig_cor), as.numeric(file_ref_sig_log2_cor))
print(paste0('R2 of correlation matrix between target reference before JMnorm: ', r2_before))
## [1] "R2 of correlation matrix between target reference before JMnorm: 0.897602700414569"
print(paste0('R2 of correlation matrix between target reference after JMnorm: ', r2_after))
## [1] "R2 of correlation matrix between target reference after JMnorm: 0.996315502185354"
par(mfrow=c(1,2))
# plot the correlation matrix of target cell-type after JMnorm
breaksList = seq(-1,1, by = 0.001)
my_colorbar=colorRampPalette(c('blue', 'white', 'red'))(n = length(breaksList))
pheatmap(target_ct_sig_JMnorm_cor, color=my_colorbar, breaks = breaksList, cluster_rows = F, cluster_cols = F, show_rownames = F, show_colnames = T, cex=1.2, main='Target Corr matrix (After JMnorm)')
# plot the correlation matrix of target cell-type before JMnorm
colnames(target_ct_sig_cor) = feature_list
pheatmap(target_ct_sig_cor, color=my_colorbar, breaks = breaksList, cluster_rows = F, cluster_cols = F, show_rownames = F, show_colnames = T, cex=1.2, main='Target Corr matrix (Before JMnorm)')
breaksList = seq(-1,1, by = 0.001)
my_colorbar=colorRampPalette(c('blue', 'white', 'red'))(n = length(breaksList))
# plot the correlation matrix of reference cell-type
pheatmap(file_ref_sig_log2_cor, color=my_colorbar, breaks = breaksList, cluster_rows = F, cluster_cols = F, show_rownames = F, show_colnames = T, cex=1.2, main='Reference Corr matrix')
par(mfrow=c(1,2))
# plot the correlation matrix of reference cell-type
plot(as.numeric(file_ref_sig_log2_cor), as.numeric(target_ct_sig_JMnorm_cor), xlab='Reference', ylab='Target', xlim=c(-0.5,1), ylim=c(-0.5,1), main='After JMnorm', cex.axis=1.5)
abline(0,1)
plot(as.numeric(file_ref_sig_log2_cor), as.numeric(target_ct_sig_cor), xlab='Reference', ylab='Target', xlim=c(-0.5,1), ylim=c(-0.5,1), main='Before JMnorm', cex.axis=1.5)
abline(0,1)
end.time <- Sys.time()
print(paste0('Running time on a MacBookPro 2021 M1Pro Chip 16GB REM: ', round((end.time - start.time), 3)))
## [1] "Running time on a MacBookPro 2021 M1Pro Chip 16GB REM: 1.323"
Guanjue Xiang, Yuchun Guo, David Bumcrot, Alla Sigova. JMnorm: a novel Joint Multi-feature normalization method for integrative and comparative epigenomics (2023)