REMO/cluster_chromosome.R at main · stuart-lab/REMO

236 lines (184 loc) · 7.41 KB
library(peakcluster)
library(irlba)
library(RcppHNSW)
library(igraph)
library(uwot)
library(ggplot2)
library(edgeR)
# Command-line interface: the first trailing argument is the chromosome label.
# It is pasted into every input and output file name used by this script, so a
# missing value is rejected here instead of surfacing later as "NA.csv.gz".
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 1 || is.na(args[1]) || !nzchar(args[1])) {
    stop("usage: Rscript cluster_chromosome.R <chromosome>", call. = FALSE)
}
chromosome <- args[1]
message(chromosome)

# Results are written under this directory; it is created if absent and an
# already-existing directory is not reported.
outdir <- "./remo"
dir.create(outdir, showWarnings = FALSE)
#' Load the ENCODE matrix for one chromosome and filter its experiments
#'
#' Reads `<p>/<chromosome>.csv.gz`, transposes it, labels the columns from
#' `<p>/<chromosome>_colnames.txt` and the rows from
#' `ENCODE/data/combined/matrix/encode_rownames.txt`, then keeps only the rows
#' whose metadata entry has `File.assembly` equal to "GRCh38" and an `Assay`
#' listed in `use`.
#'
#' @param chromosome Chromosome label used to build the input file names.
#' @param experiments Data frame of experiment metadata whose row names match
#'   the matrix row names; must contain columns `File.assembly` and `Assay`.
#' @param p Directory holding the per-chromosome files.
#' @param use Character vector of assay names to retain.
#' @return A matrix with one row per retained experiment (always a matrix,
#'   even when a single experiment is retained).
load_encode <- function(
    chromosome,
    experiments,
    p = "ENCODE/data/combined/chr",
    use = c("ATAC-seq", "DNase-seq", "Histone ChIP-seq", "TF ChIP-seq")
) {
    d <- t(read.table(file.path(p, paste0(chromosome, ".csv.gz")), sep = ","))
    colnames(d) <- readLines(file.path(p, paste0(chromosome, "_colnames.txt")))
    rownames(d) <- readLines("ENCODE/data/combined/matrix/encode_rownames.txt")
    experiment_filt <- experiments[rownames(d), ]
    # %in% never yields NA, so rows with a missing assembly are dropped
    # instead of turning into all-NA rows in the subset below
    exp_use <- (experiment_filt$File.assembly %in% "GRCh38") &
        (experiment_filt$Assay %in% use)
    d <- d[exp_use, , drop = FALSE]
    return(d)
}
#' Load all scATAC datasets for one chromosome
#'
#' Four datasets (whole body, adult brain, fetal brain, PBMC) are read from
#' `<dir><chromosome>.csv.gz`, transposed, and labelled with column names from
#' `<dir><chromosome>_colnames.txt` and row names from a per-dataset
#' `scatac_rownames.txt` file. Brain, fetal-brain and PBMC row names receive
#' the prefixes "BRAIN_", "DEVBRAIN_" and "PBMC_". The remaining seven
#' datasets are read from `<dir><tissue>_<chromosome>_pseudobulk.rds` and
#' converted with `as.matrix()`.
#'
#' @param chromosome Chromosome label used to build the input file names.
#' @param whole_body,adult_brain,dev_brain,pbmc Directories (with trailing
#'   slash) holding the per-chromosome CSV files.
#' @param adrenal,esophagus,heart_fetal,heartRV,left_colon,liver,psoas_muscle
#'   Directories (with trailing slash) holding the pseudobulk RDS files.
#' @return An unnamed list of 11 matrices in the order: whole body, adult
#'   brain, fetal brain, PBMC, adrenal, psoas muscle, esophagus, fetal heart,
#'   heart RV, left colon, liver.
load_atac <- function(
    chromosome,
    whole_body = "scATAC_atlas/data/zhang/chromosome/",
    adult_brain = "scATAC_atlas/data/brain/chromosome/",
    dev_brain = "scATAC_atlas/data/devbrain/chromosome/",
    pbmc = "scATAC_atlas/data/pbmc/chromosome/",
    adrenal = "scATAC_atlas/data/adrenal/",
    esophagus = "scATAC_atlas/data/esophagus/",
    heart_fetal = "scATAC_atlas/data/heart_fetal/",
    heartRV = "scATAC_atlas/data/heartRV/",
    left_colon = "scATAC_atlas/data/left_colon/",
    liver = "scATAC_atlas/data/liver/",
    psoas_muscle = "scATAC_atlas/data/psoas_muscle/"
) {
    # Read one CSV-backed dataset: transpose, then attach column/row labels.
    read_counts <- function(dir, rownames_file, prefix = "") {
        m <- t(read.table(paste0(dir, chromosome, ".csv.gz"), sep = ","))
        colnames(m) <- readLines(paste0(dir, chromosome, "_colnames.txt"))
        rownames(m) <- paste0(prefix, readLines(rownames_file))
        m
    }
    # Read one pseudobulk RDS dataset as a plain matrix.
    read_pseudobulk <- function(dir, tissue) {
        as.matrix(readRDS(paste0(dir, tissue, "_", chromosome, "_pseudobulk.rds")))
    }

    # adult whole body
    scatac <- read_counts(
        whole_body, "scATAC_atlas/data/zhang/matrix/scatac_rownames.txt"
    )
    # adult brain
    brain <- read_counts(
        adult_brain, "scATAC_atlas/data/brain/matrix/scatac_rownames.txt",
        prefix = "BRAIN_"
    )
    # fetal brain
    devbrain <- read_counts(
        dev_brain, "scATAC_atlas/data/devbrain/matrix/scatac_rownames.txt",
        prefix = "DEVBRAIN_"
    )
    # pbmc
    pbmc_mat <- read_counts(
        pbmc, "scATAC_atlas/data/pbmc/matrix/scatac_rownames.txt",
        prefix = "PBMC_"
    )

    # Pseudobulk tissues. Results get their own names so the directory
    # arguments are not overwritten by the matrices loaded from them.
    adrenal_mat <- read_pseudobulk(adrenal, "adrenal")
    psoas_muscle_mat <- read_pseudobulk(psoas_muscle, "psoas_muscle")
    esophagus_mat <- read_pseudobulk(esophagus, "esophagus")
    heart_fetal_mat <- read_pseudobulk(heart_fetal, "heart_fetal")
    heartRV_mat <- read_pseudobulk(heartRV, "heartRV")
    left_colon_mat <- read_pseudobulk(left_colon, "left_colon")
    liver_mat <- read_pseudobulk(liver, "liver")

    return(list(scatac, brain, devbrain, pbmc_mat,
                adrenal_mat, psoas_muscle_mat, esophagus_mat, heart_fetal_mat,
                heartRV_mat, left_colon_mat, liver_mat))
}
# ---- Load inputs for this chromosome ----

# Hi-C files; the full paths are handed to the subclustering step later on
hic_files <- list.files(path = "hi-c/data", full.names = TRUE)

# ENCODE experiment metadata, indexed by file accession
experiments <- read.table(
    file = "ENCODE/metadata_filtered.tsv",
    sep = "\t",
    header = TRUE,
    fill = TRUE,
    quote = ""
)
rownames(experiments) <- experiments[["File.accession"]]

# Combined CRE table, indexed by CRE identifier, with the interval midpoint
ccre <- read.table(file = "combined_cre.tsv", sep = "\t", header = FALSE)
colnames(ccre) <- c("chr", "start", "stop", "ident", "class")
rownames(ccre) <- ccre[["ident"]]
ccre[["midpoint"]] <- (ccre[["start"]] + ccre[["stop"]]) / 2

# ENCODE matrix restricted to histone and TF ChIP-seq experiments; the
# metadata table is not needed once the matrix has been filtered
encode <- load_encode(
    chromosome = chromosome,
    experiments = experiments,
    use = c("Histone ChIP-seq", "TF ChIP-seq")
)
rm(experiments)
ccre_subs <- ccre[colnames(encode), ]

## scATAC ##
scatac_use <- load_atac(chromosome = chromosome)
# Decide which CREs to keep: a CRE is retained when its summed accessibility
# exceeds `cs_thresh` in at least one scATAC dataset.
cs_thresh <- 100
# One column of per-CRE totals per dataset. vapply() fails loudly if the
# datasets do not all have the same number of columns.
cs_all <- vapply(scatac_use, colSums, numeric(ncol(scatac_use[[1]])))
cs_max <- apply(cs_all, 1, max)
cre_keep <- cs_max > cs_thresh
# filter datasets
encode <- encode[, cre_keep, drop = FALSE]
# reorder on position
ccre_subs <- ccre[colnames(encode), ]
ordering <- order(ccre_subs$midpoint, decreasing = FALSE)
ccre_subs <- ccre_subs[ordering, ]
encode <- encode[, ordering, drop = FALSE]
# Apply the same CRE filter, then the same ordering, to every scATAC dataset.
# NOTE(review): this assumes each scATAC matrix has its columns in the same
# CRE order as `encode` -- confirm against the input files.
for (i in seq_along(scatac_use)) {
    scatac_use[[i]] <- scatac_use[[i]][, cre_keep, drop = FALSE]
    scatac_use[[i]] <- scatac_use[[i]][, ordering, drop = FALSE]
}
rm(cre_keep)
rm(ordering)
#' Filter and depth-normalise one scATAC matrix
#'
#' Rows are first filtered on the fraction of columns with a non-zero value,
#' then columns are filtered on their column sum, and the remaining matrix is
#' passed to `peakcluster:::normalize()` with `l2 = FALSE, depthnorm = TRUE`.
#'
#' @param m Numeric matrix; rows are treated as cell types and columns as
#'   CREs.
#' @param cs_thresh Columns whose sum (after row filtering) is not greater
#'   than this value are removed.
#' @param df_thresh Rows whose fraction of non-zero entries is not greater
#'   than this value are removed.
#' @return The value returned by `peakcluster:::normalize()` for the filtered
#'   matrix.
normalize_atac <- function(m, cs_thresh = 100, df_thresh = 0.7) {
    # filter low coverage celltypes (named to avoid masking stats::df)
    detection_frac <- rowSums(m > 0) / ncol(m)
    ct_use <- detection_frac > df_thresh
    m <- m[ct_use, , drop = FALSE]
    # remove CREs with less than minimum accessibility
    m <- m[, colSums(m) > cs_thresh, drop = FALSE]
    # normalize
    m_norm <- peakcluster:::normalize(m, l2 = FALSE, depthnorm = TRUE)
    return(m_norm)
}
# Per-dataset detection thresholds, in the order returned by load_atac():
# scatac, brain, devbrain, pbmc_mat,
# adrenal_mat, psoas_muscle, esophagus, heart_fetal,
# heartRV, left_colon, liver
# NOTE(review): the listing this was recovered from stopped after the eighth
# value. The last three (heartRV, left_colon, liver) are assumed to be 0.4,
# like the other pseudobulk datasets -- confirm before relying on them.
df_thresh_use <- c(0.7, 0.7, 0.7, 0.7,
                   0.4, 0.4, 0.4, 0.4,
                   0.4, 0.4, 0.4)
# A short vector would index to NA and silently corrupt the row filter.
stopifnot(length(df_thresh_use) == length(scatac_use))
for (i in seq_along(scatac_use)) {
    nrm <- normalize_atac(m = scatac_use[[i]],
                          df_thresh = df_thresh_use[i])
    scatac_use[[i]] <- nrm
}
# remove datasets with fewer than 2 cell types remaining
n_ct <- vapply(scatac_use, nrow, integer(1))
scatac_use[n_ct < 2] <- NULL
# `dims` (number of singular vectors) and `k` (number of neighbours) are used
# below but are not assigned anywhere in this script as shown. Fail before the
# expensive normalisation rather than midway through with "object not found".
# NOTE(review): confirm where these two values are meant to be set.
if (!exists("dims", mode = "numeric") || !exists("k", mode = "numeric")) {
    stop("`dims` and `k` must be set before embedding and clustering",
         call. = FALSE)
}
# normalize encode with TMM
encode <- clip_outliers(encode)
dge <- DGEList(counts = t(encode)) # rows are features (peaks), columns are experiments
dge <- calcNormFactors(dge, method = "TMM")
encode <- t(cpm(dge, normalized.lib.sizes = TRUE, log = TRUE))
# per-experiment centre and scale, applied implicitly by irlba
d_rowmeans <- rowMeans(encode)
d_sd <- apply(encode, 1, sd)
pcs <- irlba(A = t(encode), scale = d_sd, center = d_rowmeans, nv = dims)
# Scale each left singular vector by its singular value. sweep() gives the
# same result as `pcs$u %*% diag(pcs$d)` and also handles a length-1 `d`,
# where diag() would build an identity matrix instead.
emb <- sweep(pcs$u, 2, pcs$d, `*`)
rm(d_rowmeans)
# build graph
# The first component is left out of the neighbour search.
neighbors <- hnsw_knn(X = emb[, 2:dims], k = k, distance = "euclidean")
adjacency_matrix <- NeighborMatrix(nn.idx = neighbors$idx)
snn_graph <- GraphSNN(knn = adjacency_matrix)
clusters <- Cluster(graph = snn_graph, resolution = 0.3)
print(table(clusters))
rm(neighbors)
rm(adjacency_matrix)
rm(snn_graph)
# separate encode clusters according to coaccessibility, distance, HiC
subclustering <- SubclusterAll(
    clusters = clusters,
    ccre = ccre_subs,
    scatac_use = scatac_use,
    hic_files = hic_files,
    sigma = 500000,
    chromosome = chromosome,
    method = "pearson"
)
# Index the result by CRE identifier so it can be joined to the CRE table
rownames(subclustering) <- subclustering$ccre
subclustering$ccre <- NULL
# Full outer join on row names: CREs without a subcluster assignment are kept
# with NA in the subclustering columns. merge() sorts rows by the join key by
# default, so the output is not in genomic order.
ccre_clustered <- merge(ccre_subs, subclustering, by = "row.names", all = TRUE)
rownames(ccre_clustered) <- ccre_clustered$Row.names
ccre_clustered$Row.names <- NULL
# Headerless TSV, one row per CRE, named after the chromosome
outf <- paste0(outdir, "/", chromosome, ".tsv")
write.table(ccre_clustered, file = outf, sep = "\t", row.names = FALSE, quote = FALSE, col.names = FALSE)
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

cluster_chromosome.R

Latest commit

History

cluster_chromosome.R

File metadata and controls