1. Find location of gene of interest in the assay.
  2. Isolate GeneScore or GeneExpression for that gene for all cells
  3. Calculate distribution of that set.
  4. The minimum score to obtain samples with GeneScore/GeneExpression above 75th quantile.
  5. TRUE/FALSE of values above that minimum value.
  6. Plot distributions of GeneScore/GeneExpression box and whiskers

12a. Load ggplot2 and create boxplot stat functions

library(ggplot2)

get_gim_box_stats <- function(y, upper_limit = max(ren1_gim_dt$ren1_gim) * 1.15) {
  return(data.frame(
    y = 0.95 * upper_limit,
    label = paste(
      "Count =", length(y), "\n",
      "Mean =", round(mean(y), 2), "\n",
      "Median =", round(median(y), 2), "\n"
    )
  ))
}

get_gsm_box_stats <- function(y, upper_limit = max(ren1_gsm_dt$ren1_gsm) * 1.15) {
  return(data.frame(
    y = 0.95 * upper_limit,
    label = paste(
      "Count =", length(y), "\n",
      "Mean =", round(mean(y), 2), "\n",
      "Median =", round(median(y), 2), "\n"
    )
  ))
}

custom_theme_ArchR <- function(base_family = "sans", ...){
  theme_classic(base_family = base_family, base_size = 14, ...) +
  theme(
    axis.line = element_line(size = 0.5),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "transparent"),
    plot.background = element_rect(fill = "transparent", color = NA),
    legend.background = element_rect(fill = "transparent", color = NA),
    legend.box.background = element_rect(fill = "transparent", color = NA),
    aspect.ratio = 1,
    legend.position = "none",
    plot.title = element_text(hjust = 0.5),
    panel.border = element_rect(colour = "black", fill=NA, size=0.5)
  )
}

12b. Get distribution of cells by timepoint

all_cells <- as.data.table(proj5$cellNames)
colnames(all_cells) <- c("cellname")
all_cells[, c("timepoint", "cellID") := tstrsplit(cellname, "#", fixed=TRUE)]
all_cells$timepoint <- ordered(all_cells$timepoint,
                               levels = c("E12", "E18", "P5", "P30"))
table(all_cells$timepoint)
fwrite(as.data.table(table(all_cells$timepoint)), "cells_by_timepoint.csv")
E12 E18 P5 P30
1234 3696 1204 1182

12c. Plot distribution of Ren1 gene expression by timepoint

See: https://github.com/GreenleafLab/ArchR/issues/513

Use weighted scores

gim <- getMatrixFromProject(proj5, "GeneIntegrationMatrix_LSI_Harmony")
gim_impute <- imputeMatrix(assay(gim), getImputeWeights(proj5))

# Plot this as a box and whiskers plot between clusters
ren1_gim <- gim_impute[grep('Ren1', rowData(gim)$name),]
ren1_gim_dt <- as.data.table(ren1_gim)
cellnames <- names(ren1_gim)
ren1_gim_dt[, cellname := cellnames]
ren1_gim_dt[, c("timepoint", "cellID") := tstrsplit(cellname, "#", fixed=TRUE)]

ren1_expression <- data.table(
    timepoint=c(rep("E12", table(all_cells$timepoint)['E12'][[1]]), 
                rep("E18", table(all_cells$timepoint)['E18'][[1]]), 
                rep("P5", table(all_cells$timepoint)['P5'][[1]]), 
                rep("P30", table(all_cells$timepoint)['P30'][[1]])),
    ren1_gim=c(c(ren1_gim_dt[timepoint=="E12",]$ren1_gim,
               rep(0,table(all_cells$timepoint)['E12'][[1]]-
                     length(ren1_gim_dt[timepoint=="E12",]$ren1_gim))),
               c(ren1_gim_dt[timepoint=="E18",]$ren1_gim,
               rep(0,table(all_cells$timepoint)['E18'][[1]]-
                     length(ren1_gim_dt[timepoint=="E18",]$ren1_gim))),
               c(ren1_gim_dt[timepoint=="P5",]$ren1_gim,
               rep(0, table(all_cells$timepoint)['P5'][[1]]-
                      length(ren1_gim_dt[timepoint=="P5",]$ren1_gim))),
               c(ren1_gim_dt[timepoint=="P30",]$ren1_gim,
               rep(0, table(all_cells$timepoint)['P30'][[1]]-
                      length(ren1_gim_dt[timepoint=="P30",]$ren1_gim)))))

ren1_gim_dt$timepoint <- factor(ren1_gim_dt$timepoint,
                                levels = c("E12", "E18", "P5", "P30"))

p <- ggplot(ren1_gim_dt, aes(x=timepoint, y=ren1_gim, col=timepoint)) +
    geom_boxplot() +
    scale_x_discrete("timepoint", drop=FALSE) +
    #geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0.1) +
    stat_summary(fun.data = get_gim_box_stats, geom = "text", hjust = 0.5, vjust = 0.9) +
    labs(
        x = "Mouse kidney development day",
        y = "Ren1 Log2 (NormCounts + 1)"
      ) +
    custom_theme_ArchR()
svglite(paste0("Ren1_GIM_expression_boxplot_", Sys.Date(), ".svg"))
print(p)
dev.off()

12d. Plot distribution of Ren1 gene score by timepoint

gsm <- getMatrixFromProject(proj5, "GeneScoreMatrix_no_ribosomal")
gsm_impute <- imputeMatrix(assay(gsm), getImputeWeights(proj5))
ren1_gsm <- gsm_impute[grep('Ren1', rowData(gsm)$name),]
ren1_gsm_dt <- as.data.table(ren1_gsm)
cellnames <- names(ren1_gsm)
ren1_gsm_dt[, cellname := cellnames]
ren1_gsm_dt[, c("timepoint", "cellID") := tstrsplit(cellname, "#", fixed=TRUE)]
ren1_gsm_dt$timepoint <- factor(ren1_gsm_dt$timepoint,
                                levels = c("E12", "E18", "P5", "P30"))

p <- ggplot(ren1_gsm_dt, aes(x=timepoint, y=ren1_gsm, col=timepoint)) +
    geom_boxplot() +
    scale_x_discrete("timepoint", drop=FALSE) +
    #geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0.1) +
    stat_summary(fun.data = get_gsm_box_stats, geom = "text",
                 hjust = 0.5, vjust = 0.9) +
    labs(
        x = "Mouse kidney development day",
        y = "Ren1 NormCounts"
      ) +
    custom_theme_ArchR()
svglite(paste0("Ren1_GSM_expression_boxplot_", Sys.Date(), ".svg"))
print(p)
dev.off()

save.image(paste0(sample_name, "_ImputeMatrix_", Sys.Date(), ".RData"))

12e. Plot distribution of cells by timepoint

library(forcats)
library(ggalluvial)

all_cells[, cluster := proj5$ManualClusters]
cell_distribution <- as.data.table(table(all_cells$timepoint,
                                         all_cells$cluster))
colnames(cell_distribution) <- c("timepoint", "cluster", "N")
cell_distribution$timepoint <- factor(cell_distribution$timepoint,
                                      levels = c("E12", "E18", "P5", "P30"))
fwrite(cell_distribution, "proj5_cell_distribution.csv", sep=",")
pal <- paletteDiscrete(values = cell_distribution$cluster)

p <- ggplot(cell_distribution, aes(x = timepoint, y = N, fill = cluster)) + 
  geom_bar(position = "fill", stat = "identity", color="black") +
  scale_fill_manual(values = pal) +
  labs(
        x = "Mouse kidney developmental timepoint",
        y = "Frequency"
      ) +
    custom_theme_ArchR() +
    theme(legend.position = "right")
svglite(width=20,
        paste0("Cell_distribution_frequency_stacked_barplot_",
               Sys.Date() ,".svg"))
print(p)
dev.off()

cell_distribution <- cell_distribution[order(timepoint, N),]
cell_pct <- cell_distribution %>%
    group_by(timepoint, cluster) %>%
    summarise(count = N) %>%
    mutate(perc = count/sum(count))
cell_pct$timepoint <- ordered(cell_pct$timepoint,
                              levels = c("E12", "E18", "P5", "P30"))

connected_alluvial_plot <- ggplot(cell_pct,
    aes(y = perc, x = factor(timepoint),
        fill = fct_reorder2(cluster, timepoint, perc, .desc = FALSE))) +
    geom_alluvium(aes(alluvium = fct_reorder2(cluster, timepoint,
                                              perc, .desc = FALSE)),
                  alpha=1, color = "black") +
    scale_fill_manual(NULL, breaks = 17:0, values=pal)+
    scale_y_continuous(expand = c(0,0)) +
    custom_theme_ArchR() +
    theme(legend.position = "right") +
    labs(x = "Mouse kidney developmental timepoint",
         y = "Frequency (%)",
         fill = "cluster")

svglite(width=20,
        paste0("Cell_distribution_alluvial_plot_", Sys.Date() ,".svg"))
print(connected_alluvial_plot)
dev.off()

12f. Identify JG cell marker expression.

  1. Isolate Ren1 GIM
  2. Cells with values above 90th percentile.
  3. Extract names of those cells.
jg_markers <- c("Ren1", "Akr1b7")
#smc_markers <- c("Acta2", "Smtn", "Tagln", "Myh11", "Cnn1")
smc_markers <- c("Acta2", "Myh11", "Cnn1")

Use GeneIntegrationMatrix_LSI_Harmony (GIM)

quantile_cutoff <- 0.9 # Original
jg_GIM_quantile  <- 0.90
smc_GIM_quantile <- 0.75

rm(GIM_jg_names)
for (gene in jg_markers) {
    jg_gim <- gim_impute[grepl(paste0("^", gene, "$"), rowData(gim)$name),]
    jg_upper_quartile <- names(jg_gim[jg_gim > quantile(gim_impute[grep(paste0("^", gene, "$"), rowData(gim)$name),], jg_GIM_quantile)[[1]]])
    if (exists("GIM_jg_names")) {
        GIM_jg_names <- Reduce(intersect, list(GIM_jg_names, jg_upper_quartile))
    } else {
        GIM_jg_names <- jg_upper_quartile
    }
}

rm(GIM_smc_names)
for (gene in smc_markers) {
    smc_gim <- gim_impute[grepl(paste0("^", gene, "$"), rowData(gim)$name),]
    smc_upper_quartile <- names(smc_gim[smc_gim > 
        quantile(gim_impute[grep(paste0("^", gene, "$"),
        rowData(gim)$name),], smc_GIM_quantile)[[1]]])
    if (exists("GIM_smc_names")) {
        GIM_smc_names <- Reduce(intersect, list(GIM_smc_names, smc_upper_quartile))
    } else {
        GIM_smc_names <- smc_upper_quartile
    }
}

Use GeneScoreMatrix (GSM)

jg_GSM_quantile  <- 0.75
smc_GSM_quantile <- 0.75

rm(GSM_jg_names)
for (gene in jg_markers) {
    jg_gsm <- gsm_impute[grepl(paste0("^", gene, "$"), rowData(gsm)$name),]
    jg_upper_quartile <- names(jg_gsm[jg_gsm >
        quantile(gsm_impute[grep(paste0("^", gene, "$"),
        rowData(gsm)$name),], jg_GSM_quantile)[[1]]])
    if (exists("GSM_jg_names")) {
        GSM_jg_names <- Reduce(intersect, list(GSM_jg_names, jg_upper_quartile))
    } else {
        GSM_jg_names <- jg_upper_quartile
    }
}

rm(GSM_smc_names)
for (gene in smc_markers) {
    smc_gsm <- gsm_impute[grepl(paste0("^", gene, "$"), rowData(gsm)$name),]
    smc_upper_quartile <- names(smc_gsm[smc_gsm >
        quantile(gsm_impute[grep(paste0("^", gene, "$"),
        rowData(gsm)$name),], smc_GSM_quantile)[[1]]])
    if (exists("GSM_smc_names")) {
        GSM_smc_names <- Reduce(intersect, list(GSM_smc_names, smc_upper_quartile))
    } else {
        GSM_smc_names <- smc_upper_quartile
    }
}

Intersect cells from both GeneExpression and GeneScore

SMCs <- intersect(GIM_smc_names, GSM_smc_names)
JGs  <- intersect(GIM_jg_names, GSM_jg_names)

# JG progenitors and mature cells
early_JG_index       <- grep(paste(c(JGs[grep("E18", JGs)],
                                     JGs[grep("P5", JGs)]),
                                   collapse="|"),
                             getCellNames(proj5))
late_JG_cell_index   <- grep(paste(JGs[grep("P30", JGs)],
                                   collapse="|"),
                             getCellNames(proj5))
early_SMC_index      <- grep(paste(c(SMCs[grep("E18", SMCs)],
                                     SMCs[grep("P5", SMCs)]),
                                   collapse="|"),
                             getCellNames(proj5))
late_SMC_cell_index  <- grep(paste(SMCs[grep("P30", SMCs)],
                                   collapse="|"),
                             getCellNames(proj5))

12g. Rename original cluster cells to include final JG population and final mature SMC population

Rename clusters based on time of differentiation.

Order of assignment matters here. Want to reassign any VSMCs as JGs where appropriate

proj5$ManualClustersJG <- proj5$ManualClusters
proj5$ManualClustersJG[early_SMC_index]      <- "early_VSMCs"
proj5$ManualClustersJG[late_SMC_cell_index]  <- "late_VSMCs"
proj5$ManualClustersJG[early_JG_index]       <- "early_JGs"
proj5$ManualClustersJG[late_JG_cell_index]   <- "late_JGs"

table(proj5$ManualClustersJG)
Cell Population N
CD_stroma_and_FBs 630
early_JGs 351
early_VSMCs 194
ITSs_and_FBs 54
JGs_MCs_and_PCs 285
JGs_VSMCs_and_PCs 218
late_JGs 217
late_VSMCs 48
MCs_VSMCs_and_JGs 108
MED_and_cortical_stroma_and_FBs 480
MED_stroma_and_FBs 213
mesenchymal_progenitors 719
PCs_VSMCs_and_JGs 72
PCTs 74
PODs 725
proliferating_CD 49
proliferating_cortical_stroma_and_EPIs 856
proliferating_MED_stroma_and_FBs 1094
proliferating_mesenchyme_and_PODs 608
proliferating_stroma_and_PODs 206
ureteric_epithelium 115
fwrite(as.data.table(table(proj5$ManualClustersJG)), "manual-clusters-JG_table.csv")

Now that we've identified the JG and VSMCs in the previous broad clusters, rename those clusters to reflect this change.

JGs_MCs_and_PCs_index <- grep('TRUE', proj5$ManualClustersJG == "JGs_MCs_and_PCs")
proj5$ManualClustersJG[JGs_MCs_and_PCs_index] <- "MCs_and_PCs"

MCs_VSMCs_and_JGs_index <- grep('TRUE', proj5$ManualClustersJG == "MCs_VSMCs_and_JGs")
proj5$ManualClustersJG[MCs_VSMCs_and_JGs_index] <- "MCs"

JGs_VSMCs_and_PCs_index <- grep('TRUE', proj5$ManualClustersJG == "JGs_VSMCs_and_PCs")
proj5$ManualClustersJG[JGs_VSMCs_and_PCs_index] <- "PCs"

PCs_VSMCs_and_JGs_index <- grep('TRUE', proj5$ManualClustersJG == "PCs_VSMCs_and_JGs")
proj5$ManualClustersJG[PCs_VSMCs_and_JGs_index] <- "PCs"

table(proj5$ManualClustersJG)
Cell Population N
CD_stroma_and_FBs 630
early_JGs 351
early_VSMCs 194
ITSs_and_FBs 54
late_JGs 217
late_VSMCs 48
MCs 108
MCs_and_PCs 285
MED_and_cortical_stroma_and_FBs 480
MED_stroma_and_FBs 213
mesenchymal_progenitors 719
PCs 290
PCTs 74
PODs 725
proliferating_CD 49
proliferating_cortical_stroma_and_EPIs 856
proliferating_MED_stroma_and_FBs 1094
proliferating_mesenchyme_and_PODs 608
proliferating_stroma_and_PODs 206
ureteric_epithelium 115

12h. Plot UMAP with annotated JG and SMC

pal2 <- paletteDiscrete(values = proj5$ManualClustersJG)

cell_identity_embeddings <- plotEmbedding(
    proj5,
    embedding = "UMAP_LSI_Harmony",
    colorBy = "cellColData",
    name = "ManualClustersJG",
    rastr = FALSE,
    pal = pal2
)

svglite(paste0(sample_name,
               "_ManualClustersJG_UMAP_", Sys.Date(), ".svg"),
        height=5, width=10)
print(cell_identity_embeddings)
dev.off()

12i. Plot gene markers on UMAP

genes_of_interest <- c("Ren1", "Foxd1", "Akr1b7",
                       "Acta2", "Smtn", "Tagln", "Myh11", "Cnn1")
for (gene in genes_of_interest) {
    if (!file.exists(paste0(sample_name,
                            "_manual_cell_identities_JG_UMAP_GIM-",
                            gene, "_", Sys.Date(), ".svg"))) {
        p1 <- plotEmbedding(
            ArchRProj = proj5, 
            colorBy = "GeneIntegrationMatrix_LSI_Harmony", 
            name = gene, 
            continuousSet = "horizonExtra",
            embedding = "UMAP_LSI_Harmony",
            imputeWeights = getImputeWeights(proj5),
            threads=getArchRThreads()/2
        )

        p2 <- plotEmbedding(
            ArchRProj = proj5, 
            colorBy = "GeneScoreMatrix_no_ribosomal", 
            continuousSet = "horizonExtra",
            name = gene, 
            embedding = "UMAP_LSI_Harmony",
            imputeWeights = getImputeWeights(proj5),
            threads=getArchRThreads()/2
        )

        svglite(paste0(sample_name,
                       "_ManualClustersJG_UMAP_GIM-",
                       gene, "_", Sys.Date(), ".svg"))
        print(p1)
        dev.off()

        svglite(paste0(sample_name,
                       "_ManualClustersJG_UMAP_GSM-",
                       gene, "_", Sys.Date(), ".svg"))
        print(p2)
        dev.off()
    }
}