- Find location of gene of interest in the assay.
- Isolate GeneScore or GeneExpression for that gene for all cells
- Calculate distribution of that set.
- The minimum score to obtain samples with GeneScore/GeneExpression above 75th quantile.
- TRUE/FALSE of values above that minimum value.
- Plot distributions of GeneScore/GeneExpression box and whiskers
12a. Load ggplot2 and create boxplot stat functions
library(ggplot2)
get_gim_box_stats <- function(y, upper_limit = max(ren1_gim_dt$ren1_gim) * 1.15) {
return(data.frame(
y = 0.95 * upper_limit,
label = paste(
"Count =", length(y), "\n",
"Mean =", round(mean(y), 2), "\n",
"Median =", round(median(y), 2), "\n"
)
))
}
get_gsm_box_stats <- function(y, upper_limit = max(ren1_gsm_dt$ren1_gsm) * 1.15) {
return(data.frame(
y = 0.95 * upper_limit,
label = paste(
"Count =", length(y), "\n",
"Mean =", round(mean(y), 2), "\n",
"Median =", round(median(y), 2), "\n"
)
))
}
custom_theme_ArchR <- function(base_family = "sans", ...){
theme_classic(base_family = base_family, base_size = 14, ...) +
theme(
axis.line = element_line(size = 0.5),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "transparent"),
plot.background = element_rect(fill = "transparent", color = NA),
legend.background = element_rect(fill = "transparent", color = NA),
legend.box.background = element_rect(fill = "transparent", color = NA),
aspect.ratio = 1,
legend.position = "none",
plot.title = element_text(hjust = 0.5),
panel.border = element_rect(colour = "black", fill=NA, size=0.5)
)
}
12b. Get distribution of cells by timepoint
all_cells <- as.data.table(proj5$cellNames)
colnames(all_cells) <- c("cellname")
all_cells[, c("timepoint", "cellID") := tstrsplit(cellname, "#", fixed=TRUE)]
all_cells$timepoint <- ordered(all_cells$timepoint,
levels = c("E12", "E18", "P5", "P30"))
table(all_cells$timepoint)
fwrite(as.data.table(table(all_cells$timepoint)), "cells_by_timepoint.csv")
| E12 | E18 | P5 | P30 |
|---|---|---|---|
| 1234 | 3696 | 1204 | 1182 |
12c. Plot distribution of Ren1 gene expression by timepoint
See: https://github.com/GreenleafLab/ArchR/issues/513
Use weighted scores
gim <- getMatrixFromProject(proj5, "GeneIntegrationMatrix_LSI_Harmony")
gim_impute <- imputeMatrix(assay(gim), getImputeWeights(proj5))
# Plot this as a box and whiskers plot between clusters
ren1_gim <- gim_impute[grep('Ren1', rowData(gim)$name),]
ren1_gim_dt <- as.data.table(ren1_gim)
cellnames <- names(ren1_gim)
ren1_gim_dt[, cellname := cellnames]
ren1_gim_dt[, c("timepoint", "cellID") := tstrsplit(cellname, "#", fixed=TRUE)]
ren1_expression <- data.table(
timepoint=c(rep("E12", table(all_cells$timepoint)['E12'][[1]]),
rep("E18", table(all_cells$timepoint)['E18'][[1]]),
rep("P5", table(all_cells$timepoint)['P5'][[1]]),
rep("P30", table(all_cells$timepoint)['P30'][[1]])),
ren1_gim=c(c(ren1_gim_dt[timepoint=="E12",]$ren1_gim,
rep(0,table(all_cells$timepoint)['E12'][[1]]-
length(ren1_gim_dt[timepoint=="E12",]$ren1_gim))),
c(ren1_gim_dt[timepoint=="E18",]$ren1_gim,
rep(0,table(all_cells$timepoint)['E18'][[1]]-
length(ren1_gim_dt[timepoint=="E18",]$ren1_gim))),
c(ren1_gim_dt[timepoint=="P5",]$ren1_gim,
rep(0, table(all_cells$timepoint)['P5'][[1]]-
length(ren1_gim_dt[timepoint=="P5",]$ren1_gim))),
c(ren1_gim_dt[timepoint=="P30",]$ren1_gim,
rep(0, table(all_cells$timepoint)['P30'][[1]]-
length(ren1_gim_dt[timepoint=="P30",]$ren1_gim)))))
ren1_gim_dt$timepoint <- factor(ren1_gim_dt$timepoint,
levels = c("E12", "E18", "P5", "P30"))
p <- ggplot(ren1_gim_dt, aes(x=timepoint, y=ren1_gim, col=timepoint)) +
geom_boxplot() +
scale_x_discrete("timepoint", drop=FALSE) +
#geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0.1) +
stat_summary(fun.data = get_gim_box_stats, geom = "text", hjust = 0.5, vjust = 0.9) +
labs(
x = "Mouse kidney development day",
y = "Ren1 Log2 (NormCounts + 1)"
) +
custom_theme_ArchR()
svglite(paste0("Ren1_GIM_expression_boxplot_", Sys.Date(), ".svg"))
print(p)
dev.off()
12d. Plot distribution of Ren1 gene score by timepoint
gsm <- getMatrixFromProject(proj5, "GeneScoreMatrix_no_ribosomal")
gsm_impute <- imputeMatrix(assay(gsm), getImputeWeights(proj5))
ren1_gsm <- gsm_impute[grep('Ren1', rowData(gsm)$name),]
ren1_gsm_dt <- as.data.table(ren1_gsm)
cellnames <- names(ren1_gsm)
ren1_gsm_dt[, cellname := cellnames]
ren1_gsm_dt[, c("timepoint", "cellID") := tstrsplit(cellname, "#", fixed=TRUE)]
ren1_gsm_dt$timepoint <- factor(ren1_gsm_dt$timepoint,
levels = c("E12", "E18", "P5", "P30"))
p <- ggplot(ren1_gsm_dt, aes(x=timepoint, y=ren1_gsm, col=timepoint)) +
geom_boxplot() +
scale_x_discrete("timepoint", drop=FALSE) +
#geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0.1) +
stat_summary(fun.data = get_gsm_box_stats, geom = "text",
hjust = 0.5, vjust = 0.9) +
labs(
x = "Mouse kidney development day",
y = "Ren1 NormCounts"
) +
custom_theme_ArchR()
svglite(paste0("Ren1_GSM_expression_boxplot_", Sys.Date(), ".svg"))
print(p)
dev.off()
save.image(paste0(sample_name, "_ImputeMatrix_", Sys.Date(), ".RData"))
12e. Plot distribution of cells by timepoint
library(forcats)
library(ggalluvial)
all_cells[, cluster := proj5$ManualClusters]
cell_distribution <- as.data.table(table(all_cells$timepoint,
all_cells$cluster))
colnames(cell_distribution) <- c("timepoint", "cluster", "N")
cell_distribution$timepoint <- factor(cell_distribution$timepoint,
levels = c("E12", "E18", "P5", "P30"))
fwrite(cell_distribution, "proj5_cell_distribution.csv", sep=",")
pal <- paletteDiscrete(values = cell_distribution$cluster)
p <- ggplot(cell_distribution, aes(x = timepoint, y = N, fill = cluster)) +
geom_bar(position = "fill", stat = "identity", color="black") +
scale_fill_manual(values = pal) +
labs(
x = "Mouse kidney developmental timepoint",
y = "Frequency"
) +
custom_theme_ArchR() +
theme(legend.position = "right")
svglite(width=20,
paste0("Cell_distribution_frequency_stacked_barplot_",
Sys.Date() ,".svg"))
print(p)
dev.off()
cell_distribution <- cell_distribution[order(timepoint, N),]
cell_pct <- cell_distribution %>%
group_by(timepoint, cluster) %>%
summarise(count = N) %>%
mutate(perc = count/sum(count))
cell_pct$timepoint <- ordered(cell_pct$timepoint,
levels = c("E12", "E18", "P5", "P30"))
connected_alluvial_plot <- ggplot(cell_pct,
aes(y = perc, x = factor(timepoint),
fill = fct_reorder2(cluster, timepoint, perc, .desc = FALSE))) +
geom_alluvium(aes(alluvium = fct_reorder2(cluster, timepoint,
perc, .desc = FALSE)),
alpha=1, color = "black") +
scale_fill_manual(NULL, breaks = 17:0, values=pal)+
scale_y_continuous(expand = c(0,0)) +
custom_theme_ArchR() +
theme(legend.position = "right") +
labs(x = "Mouse kidney developmental timepoint",
y = "Frequency (%)",
fill = "cluster")
svglite(width=20,
paste0("Cell_distribution_alluvial_plot_", Sys.Date() ,".svg"))
print(connected_alluvial_plot)
dev.off()
12f. Identify JG cell marker expression.
- Isolate Ren1 GIM
- Cells with values above 90th percentile.
- Extract names of those cells.
jg_markers <- c("Ren1", "Akr1b7")
#smc_markers <- c("Acta2", "Smtn", "Tagln", "Myh11", "Cnn1")
smc_markers <- c("Acta2", "Myh11", "Cnn1")
Use GeneIntegrationMatrix_LSI_Harmony (GIM)
quantile_cutoff <- 0.9 # Original
jg_GIM_quantile <- 0.90
smc_GIM_quantile <- 0.75
rm(GIM_jg_names)
for (gene in jg_markers) {
jg_gim <- gim_impute[grepl(paste0("^", gene, "$"), rowData(gim)$name),]
jg_upper_quartile <- names(jg_gim[jg_gim > quantile(gim_impute[grep(paste0("^", gene, "$"), rowData(gim)$name),], jg_GIM_quantile)[[1]]])
if (exists("GIM_jg_names")) {
GIM_jg_names <- Reduce(intersect, list(GIM_jg_names, jg_upper_quartile))
} else {
GIM_jg_names <- jg_upper_quartile
}
}
rm(GIM_smc_names)
for (gene in smc_markers) {
smc_gim <- gim_impute[grepl(paste0("^", gene, "$"), rowData(gim)$name),]
smc_upper_quartile <- names(smc_gim[smc_gim >
quantile(gim_impute[grep(paste0("^", gene, "$"),
rowData(gim)$name),], smc_GIM_quantile)[[1]]])
if (exists("GIM_smc_names")) {
GIM_smc_names <- Reduce(intersect, list(GIM_smc_names, smc_upper_quartile))
} else {
GIM_smc_names <- smc_upper_quartile
}
}
Use GeneScoreMatrix (GSM)
jg_GSM_quantile <- 0.75
smc_GSM_quantile <- 0.75
rm(GSM_jg_names)
for (gene in jg_markers) {
jg_gsm <- gsm_impute[grepl(paste0("^", gene, "$"), rowData(gsm)$name),]
jg_upper_quartile <- names(jg_gsm[jg_gsm >
quantile(gsm_impute[grep(paste0("^", gene, "$"),
rowData(gsm)$name),], jg_GSM_quantile)[[1]]])
if (exists("GSM_jg_names")) {
GSM_jg_names <- Reduce(intersect, list(GSM_jg_names, jg_upper_quartile))
} else {
GSM_jg_names <- jg_upper_quartile
}
}
rm(GSM_smc_names)
for (gene in smc_markers) {
smc_gsm <- gsm_impute[grepl(paste0("^", gene, "$"), rowData(gsm)$name),]
smc_upper_quartile <- names(smc_gsm[smc_gsm >
quantile(gsm_impute[grep(paste0("^", gene, "$"),
rowData(gsm)$name),], smc_GSM_quantile)[[1]]])
if (exists("GSM_smc_names")) {
GSM_smc_names <- Reduce(intersect, list(GSM_smc_names, smc_upper_quartile))
} else {
GSM_smc_names <- smc_upper_quartile
}
}
Intersect cells from both GeneExpression and GeneScore
SMCs <- intersect(GIM_smc_names, GSM_smc_names)
JGs <- intersect(GIM_jg_names, GSM_jg_names)
# JG progenitors and mature cells
early_JG_index <- grep(paste(c(JGs[grep("E18", JGs)],
JGs[grep("P5", JGs)]),
collapse="|"),
getCellNames(proj5))
late_JG_cell_index <- grep(paste(JGs[grep("P30", JGs)],
collapse="|"),
getCellNames(proj5))
early_SMC_index <- grep(paste(c(SMCs[grep("E18", SMCs)],
SMCs[grep("P5", SMCs)]),
collapse="|"),
getCellNames(proj5))
late_SMC_cell_index <- grep(paste(SMCs[grep("P30", SMCs)],
collapse="|"),
getCellNames(proj5))
12g. Rename original cluster cells to include final JG population and final mature SMC population
Rename clusters based on time of differentiation.
Order of assignment matters here. Want to reassign any VSMCs as JGs where appropriate
proj5$ManualClustersJG <- proj5$ManualClusters
proj5$ManualClustersJG[early_SMC_index] <- "early_VSMCs"
proj5$ManualClustersJG[late_SMC_cell_index] <- "late_VSMCs"
proj5$ManualClustersJG[early_JG_index] <- "early_JGs"
proj5$ManualClustersJG[late_JG_cell_index] <- "late_JGs"
table(proj5$ManualClustersJG)
| Cell Population | N |
|---|---|
| CD_stroma_and_FBs | 630 |
| early_JGs | 351 |
| early_VSMCs | 194 |
| ITSs_and_FBs | 54 |
| JGs_MCs_and_PCs | 285 |
| JGs_VSMCs_and_PCs | 218 |
| late_JGs | 217 |
| late_VSMCs | 48 |
| MCs_VSMCs_and_JGs | 108 |
| MED_and_cortical_stroma_and_FBs | 480 |
| MED_stroma_and_FBs | 213 |
| mesenchymal_progenitors | 719 |
| PCs_VSMCs_and_JGs | 72 |
| PCTs | 74 |
| PODs | 725 |
| proliferating_CD | 49 |
| proliferating_cortical_stroma_and_EPIs | 856 |
| proliferating_MED_stroma_and_FBs | 1094 |
| proliferating_mesenchyme_and_PODs | 608 |
| proliferating_stroma_and_PODs | 206 |
| ureteric_epithelium | 115 |
fwrite(as.data.table(table(proj5$ManualClustersJG)), "manual-clusters-JG_table.csv")
Now that we've identified the JG and VSMCs in the previous broad clusters, rename those clusters to reflect this change.
JGs_MCs_and_PCs_index <- grep('TRUE', proj5$ManualClustersJG == "JGs_MCs_and_PCs")
proj5$ManualClustersJG[JGs_MCs_and_PCs_index] <- "MCs_and_PCs"
MCs_VSMCs_and_JGs_index <- grep('TRUE', proj5$ManualClustersJG == "MCs_VSMCs_and_JGs")
proj5$ManualClustersJG[MCs_VSMCs_and_JGs_index] <- "MCs"
JGs_VSMCs_and_PCs_index <- grep('TRUE', proj5$ManualClustersJG == "JGs_VSMCs_and_PCs")
proj5$ManualClustersJG[JGs_VSMCs_and_PCs_index] <- "PCs"
PCs_VSMCs_and_JGs_index <- grep('TRUE', proj5$ManualClustersJG == "PCs_VSMCs_and_JGs")
proj5$ManualClustersJG[PCs_VSMCs_and_JGs_index] <- "PCs"
table(proj5$ManualClustersJG)
| Cell Population | N |
|---|---|
| CD_stroma_and_FBs | 630 |
| early_JGs | 351 |
| early_VSMCs | 194 |
| ITSs_and_FBs | 54 |
| late_JGs | 217 |
| late_VSMCs | 48 |
| MCs | 108 |
| MCs_and_PCs | 285 |
| MED_and_cortical_stroma_and_FBs | 480 |
| MED_stroma_and_FBs | 213 |
| mesenchymal_progenitors | 719 |
| PCs | 290 |
| PCTs | 74 |
| PODs | 725 |
| proliferating_CD | 49 |
| proliferating_cortical_stroma_and_EPIs | 856 |
| proliferating_MED_stroma_and_FBs | 1094 |
| proliferating_mesenchyme_and_PODs | 608 |
| proliferating_stroma_and_PODs | 206 |
| ureteric_epithelium | 115 |
12h. Plot UMAP with annotated JG and SMC
pal2 <- paletteDiscrete(values = proj5$ManualClustersJG)
cell_identity_embeddings <- plotEmbedding(
proj5,
embedding = "UMAP_LSI_Harmony",
colorBy = "cellColData",
name = "ManualClustersJG",
rastr = FALSE,
pal = pal2
)
svglite(paste0(sample_name,
"_ManualClustersJG_UMAP_", Sys.Date(), ".svg"),
height=5, width=10)
print(cell_identity_embeddings)
dev.off()
12i. Plot gene markers on UMAP
genes_of_interest <- c("Ren1", "Foxd1", "Akr1b7",
"Acta2", "Smtn", "Tagln", "Myh11", "Cnn1")
for (gene in genes_of_interest) {
if (!file.exists(paste0(sample_name,
"_manual_cell_identities_JG_UMAP_GIM-",
gene, "_", Sys.Date(), ".svg"))) {
p1 <- plotEmbedding(
ArchRProj = proj5,
colorBy = "GeneIntegrationMatrix_LSI_Harmony",
name = gene,
continuousSet = "horizonExtra",
embedding = "UMAP_LSI_Harmony",
imputeWeights = getImputeWeights(proj5),
threads=getArchRThreads()/2
)
p2 <- plotEmbedding(
ArchRProj = proj5,
colorBy = "GeneScoreMatrix_no_ribosomal",
continuousSet = "horizonExtra",
name = gene,
embedding = "UMAP_LSI_Harmony",
imputeWeights = getImputeWeights(proj5),
threads=getArchRThreads()/2
)
svglite(paste0(sample_name,
"_ManualClustersJG_UMAP_GIM-",
gene, "_", Sys.Date(), ".svg"))
print(p1)
dev.off()
svglite(paste0(sample_name,
"_ManualClustersJG_UMAP_GSM-",
gene, "_", Sys.Date(), ".svg"))
print(p2)
dev.off()
}
}