JMnorm: a novel Joint Multi-feature normalization method for integrative and comparative epigenomics


Guanjue Xiang, Yuchun Guo, David Bumcrot, Alla Sigova
CAMP4 Therapeutics Corp., Cambridge, MA, USA

Description: This R Markdown file demonstrates the use of JMnorm to normalize the target signal matrix “TCD8.raw_sigmat.txt” against the reference signal matrix “ref.raw_sigmat.txt”. The input matrices, “TCD8.raw_sigmat.txt” and “ref.raw_sigmat.txt”, should be formatted as N-by-(M+1) matrices, where N represents the number of cCREs, and M represents the number of chromatin features. The first column of each matrix contains the cCRE IDs. The signal values for each chromatin feature in the cCREs are expected to be non-negative and in linear scale.


run JMnorm

Load Libraries

start.time <- Sys.time()
library(readr)
library(pheatmap)
library(dynamicTreeCut)

Load JMnorm source code & setup working directory.

!!! Change the file path of “JMnorm.script.R” and working directory path.

# Source the JMnorm script 
source('/Users/guanjuexiang/Documents/projects/git/JMnorm/bin/JMnorm_core/JMnorm.script.R')

# Set the working directory
setwd('/Users/guanjuexiang/Documents/projects/git/JMnorm/test_data/')

Set parameters

# get feature list
feature_list = c("ATAC", "H3K27ac", "H3K27me3", "H3K36me3", "H3K4me1", "H3K4me3", "H3K9me3")

# added 1 to avoid 0 in log transformation
add_sn = 1

# input file names
# The testing signal matrices can be found in this link: https://github.com/camp4tx/JMnorm/tree/main/docs
refer_sigmat_file = 'ref.raw_sigmat.txt'
target_sigmat_file = 'TCD8.raw_sigmat.txt'
after_JMnorm_output_target_sigmat_file = 'TCD8.JMnorm_sigmat.txt'

Read reference signal matrix

# read refe_ave_sigmat
file_ref = as.data.frame(read_table(refer_sigmat_file, col_names = T))
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   cCREids = col_double(),
##   ATAC = col_double(),
##   H3K27ac = col_double(),
##   H3K27me3 = col_double(),
##   H3K36me3 = col_double(),
##   H3K4me1 = col_double(),
##   H3K4me3 = col_double(),
##   H3K9me3 = col_double()
## )
file_ref_sig0 = file_ref[,-1]

Normalize signal to noise ratio (SNR) of reference signal matrix

# normalize signal to noise ratio (SNR) across features (Optional)
file_ref_sig = normalize_snr(file_ref_sig0)
# if user decided to NOT normalize SNR across different features, then use the following command instead
# file_ref_sig = file_ref_sig0

# log2 transformation
file_ref_sig_log2 = log2(file_ref_sig+add_sn)

Determine number of clusters in reference signal matrix

# Determine the K value for JMnorm based on reference signal matrix
K = determineK(file_ref_sig_log2, sample_size=20000)
##  ..cutHeight not given, setting it to 8.87  ===>  99% of the (truncated) height range in dendro.
##  ..done.
## [1] "cutreeDynamic: "
## ref_km_cluster
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
## 1980  928  902  785  670  653  642  632  570  559  558  554  544  527  507  503 
##   17   18   19   20   21   22   23   24   25   26   27   28   29   30   31   32 
##  500  500  488  479  478  448  446  439  404  396  393  376  360  333  319  314 
##   33   34   35   36   37   38   39 
##  312  299  281  257  244  217  203

Read target signal matrix

# read target signal matrix
target_ct = as.data.frame(read_table(target_sigmat_file, col_names = T))
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   cCREids = col_double(),
##   ATAC = col_double(),
##   H3K27ac = col_double(),
##   H3K27me3 = col_double(),
##   H3K36me3 = col_double(),
##   H3K4me1 = col_double(),
##   H3K4me3 = col_double(),
##   H3K9me3 = col_double()
## )
# add noise to avoid many 0s
target_ct_sig0 = target_ct[,-1]+matrix(runif(prod(dim(target_ct)), min = -0.00001, max = 0.0001), nrow = dim(target_ct)[1])

# avoid negative values
target_ct_sig0 = target_ct_sig0-min(target_ct_sig0)

Normalize signal to noise ratio (SNR) of target signal matrix

# normalize signal to noise ratio (SNR) across features
target_ct_sig = normalize_snr(target_ct_sig0)

# log2 transformation
target_ct_sig_log2 = log2(target_ct_sig+add_sn)

Use JMnorm to normalize target signal matrix against reference signal matrix

# JMnorm
target_ct_sig_JMnorm = JMnorm(target_ct_sig_log2, file_ref_sig_log2, K)
## [1]    34 32795 37316
## [1]    16 35061 14988
## [1]    13 49384 22165
## [1]   24 7220 7043
## [1]    14 50556 26648
## [1]    23 30484 24946
## [1]    17 20507 18099
## [1]    27 28008 26640
## [1]     5 41008 25930
## [1]     4 22239 68896
## [1]    22 27531 40119
## [1]    15 29566 27136
## [1]     3 24341 45554
## [1]    11 15113 30515
## [1]    12 28482 18798
## [1]    37 17254 24912
## [1]    36 20973 11308
## [1]     9 26209 21586
## [1]   26 4539 9889
## [1]    32 15292 11037
## [1]    33 50138 39076
## [1]    21 15544 52248
## [1]    29 18884 13010
## [1]    38 11384 18603
## [1]     2 16858 18118
## [1]    19 21352 20864
## [1]     1 33686 42270
## [1]    39 30609 30725
## [1]    35 14575 10341
## [1]    31 36318 22579
## [1]    28 27210 31413
## [1]    25 33083 39393
## [1]     6 42261 28134
## [1]    18 36514 61185
## [1]    20 12250  5969
## [1]     8 20030 18513
## [1]   30 7856 7582
## [1]     7 47084 40008
## [1]    10 61680 50322
Write JMnorm normalized signal matrix

# add colnames
colnames(file_ref_sig_log2) = feature_list

# correlation between features in reference cell-type
file_ref_sig_log2_cor = cor(file_ref_sig_log2)

# add colnames
colnames(target_ct_sig_JMnorm) = feature_list

# convert JMnorm normalized target signal matrix back to linear scale
target_ct_sig_JMnorm_linear = 2^(target_ct_sig_JMnorm)-add_sn

# write output JMnorm normalized target signal matrix
output_mat = cbind(target_ct[,1], round(target_ct_sig_JMnorm_linear, 3))
colnames(output_mat) = colnames(target_ct)
write.table(output_mat, after_JMnorm_output_target_sigmat_file, quote=F, col.names=T, row.names = F, sep='\t')
Plot cross-feature correlation matrix heatmap

Get cross-feature correlation matrix

# correlation between features in target cell-type before JMnorm
target_ct_sig_cor = cor(target_ct_sig_log2)

# correlation between features in target cell-type after JMnorm
target_ct_sig_JMnorm_cor = cor(target_ct_sig_JMnorm)

# calculate the R2 between the correlation matrix of target cell-type and reference cell-type
r2_after = r2(as.numeric(target_ct_sig_JMnorm_cor), as.numeric(file_ref_sig_log2_cor))

r2_before = r2(as.numeric(target_ct_sig_cor), as.numeric(file_ref_sig_log2_cor))
print(paste0('R2 of correlation matrix between target reference before JMnorm: ', r2_before))
## [1] "R2 of correlation matrix between target reference before JMnorm: 0.897602700414569"
print(paste0('R2 of correlation matrix between target reference after JMnorm: ', r2_after))
## [1] "R2 of correlation matrix between target reference after JMnorm: 0.996315502185354"
  1. plot the correlation matrix of target cell-type AFTER JMnorm
par(mfrow=c(1,2))
# plot the correlation matrix of target cell-type after JMnorm
breaksList = seq(-1,1, by = 0.001)
my_colorbar=colorRampPalette(c('blue', 'white', 'red'))(n = length(breaksList))
pheatmap(target_ct_sig_JMnorm_cor, color=my_colorbar, breaks = breaksList, cluster_rows = F, cluster_cols = F, show_rownames = F, show_colnames = T, cex=1.2, main='Target Corr matrix (After JMnorm)')

  1. plot the correlation matrix of target cell-type BEFORE JMnorm
# plot the correlation matrix of target cell-type before JMnorm
colnames(target_ct_sig_cor) = feature_list
pheatmap(target_ct_sig_cor, color=my_colorbar, breaks = breaksList, cluster_rows = F, cluster_cols = F, show_rownames = F, show_colnames = T, cex=1.2, main='Target Corr matrix (Before JMnorm)')

  1. plot the correlation matrix of reference
breaksList = seq(-1,1, by = 0.001)
my_colorbar=colorRampPalette(c('blue', 'white', 'red'))(n = length(breaksList))
# plot the correlation matrix of reference cell-type
pheatmap(file_ref_sig_log2_cor, color=my_colorbar, breaks = breaksList, cluster_rows = F, cluster_cols = F, show_rownames = F, show_colnames = T, cex=1.2, main='Reference Corr matrix')

  1. plot scatterplots of correlations between reference and target (Before & After JMnorm)
par(mfrow=c(1,2))

# plot the correlation matrix of reference cell-type
plot(as.numeric(file_ref_sig_log2_cor), as.numeric(target_ct_sig_JMnorm_cor), xlab='Reference', ylab='Target', xlim=c(-0.5,1), ylim=c(-0.5,1), main='After JMnorm', cex.axis=1.5)
abline(0,1)

plot(as.numeric(file_ref_sig_log2_cor), as.numeric(target_ct_sig_cor), xlab='Reference', ylab='Target', xlim=c(-0.5,1), ylim=c(-0.5,1), main='Before JMnorm', cex.axis=1.5)
abline(0,1)

end.time <- Sys.time()
print(paste0('Running time on a MacBookPro 2021 M1Pro Chip 16GB REM: ', round((end.time - start.time), 3)))
## [1] "Running time on a MacBookPro 2021 M1Pro Chip 16GB REM: 1.323"

Reference

Guanjue Xiang, Yuchun Guo, David Bumcrot, Alla Sigova. JMnorm: a novel Joint Multi-feature normalization method for integrative and comparative epigenomics (2023)