9a. Integrate with scATAC data

all_sce <- as.SingleCellExperiment(seurat_named)

proj3 <- addGeneIntegrationMatrix(
    ArchRProj = proj3, 
    useMatrix = "GeneScoreMatrix",
    matrixName = "GeneIntegrationMatrix_LSI_Harmony",
    reducedDims = "LSI_Harmony",
    seRNA = all_sce,
    addToArrow = FALSE,
    groupRNA = "ident",
    nameCell = "predictedCell_Un_LSI_Harmony",
    nameGroup = "predictedGroup_Un_LSI_Harmony",
    nameScore = "predictedScore_Un_LSI_Harmony"
)

cM <- as.matrix(confusionMatrix(proj3$Clusters_LSI, proj3$predictedGroup_Un_LSI_Harmony))
preClust <- colnames(cM)[apply(cM, 1 , which.max)]

#show which scRNA-seq cell type is most abundant in each of the scATAC-seq clusters
cbind(preClust, rownames(cM)) #Assignments

unique(unique(proj3$predictedGroup_Un_LSI_Harmony))

9b. Get cell IDs for each scRNA-seq cluster

unique(colData(all_sce)$ident)

rnaMP <- colnames(all_sce)[grep("mesenchymal_progenitors",
                                 colData(all_sce)$ident)]

rnaProMyeloid <- colnames(all_sce)[
    grep("proliferating_myeloid_and_stromal_cells", colData(all_sce)$ident)]

rnaImmune <- colnames(all_sce)[grep("immune_cells", colData(all_sce)$ident)]

rnaCDFB <- colnames(all_sce)[grep("CD_stroma_and_FBs", colData(all_sce)$ident)]

rnaCorticalEPI <- colnames(all_sce)[
    grep("cortical_stroma_and_EPIs", colData(all_sce)$ident)]

rnaProStromaENDO <- colnames(all_sce)[
    grep("proliferating_stroma_and_ENDOs", colData(all_sce)$ident)]

rnaMedCortFB <- colnames(all_sce)[
    grep("MED_and_cortical_stroma_and_FBs", colData(all_sce)$ident)]

rnaITSFB <- colnames(all_sce)[grep("ITSs_and_FBs", colData(all_sce)$ident)]

rnaMedStromaFB <- colnames(all_sce)[
    grep("MED_stroma_and_FBs", colData(all_sce)$ident)]

rnaJGMCPC <- colnames(all_sce)[grep("JGs_MCs_and_PCs", colData(all_sce)$ident)]

rnaMCVSMCJG <- colnames(all_sce)[
    grep("MCs_VSMCs_and_JGs", colData(all_sce)$ident)]

rnaPCVSMCJG <- colnames(all_sce)[
    grep("PCs_VSMCs_and_JGs", colData(all_sce)$ident)]

rnaPCT <- colnames(all_sce)[grep("PCTs", colData(all_sce)$ident)]

rnaPOD1 <- colnames(all_sce)[grep("POD_1", colData(all_sce)$ident)]

rnaPOD2 <- colnames(all_sce)[grep("POD_2", colData(all_sce)$ident)]

rnaProCD <- colnames(all_sce)[
    grep("proliferating_CD", colData(all_sce)$ident)]

rnaProStroma <- colnames(all_sce)[
    grep("proliferating_stroma", colData(all_sce)$ident)]

rnaProStromaPODs <- colnames(all_sce)[
    grep("proliferating_stroma_and_PODs", colData(all_sce)$ident)]

rnaProStromaTubule <- colnames(all_sce)[
    grep("proliferating_stroma_and_tubule_cells", colData(all_sce)$ident)]

rnaUE <- colnames(all_sce)[grep("ureteric_epithelium", colData(all_sce)$ident)]

rnaUSFB <- colnames(all_sce)[
    grep("ureteric_stroma_and_FBs", colData(all_sce)$ident)]

9c. Create groups of integrated scATAC-seq and scRNA-seq cells by IDs

# "proliferating_CD"                "C17"

groupList <- SimpleList(
    MedCortFB = SimpleList(
        ATAC = proj3$cellNames[proj3$Clusters_LSI %in% c("C6")],
        RNA = rnaMedCortFB
    ),
    MP = SimpleList(
        ATAC = proj3$cellNames[proj3$Clusters_LSI %in% c("C10", "C11")],
        RNA = rnaMP
    ),
    PCVSMCJG = SimpleList(
        ATAC = proj3$cellNames[proj3$Clusters_LSI %in% c("C2", "C12", "C13")],
        RNA = rnaPCVSMCJG
    ),
    POD1 = SimpleList(
        ATAC = proj3$cellNames[proj3$Clusters_LSI %in% c("C19", "C20")],
        RNA = rnaPOD1
    ),
    PCT = SimpleList(
        ATAC = proj3$cellNames[proj3$Clusters_LSI %in% c("C18")],
        RNA = rnaPCT
    ),
    ProStroma = SimpleList(
        ATAC = proj3$cellNames[proj3$Clusters_LSI %in% c("C3")],
        RNA = rnaProStroma
    ),
    MedStromaFB = SimpleList(
        ATAC = proj3$cellNames[proj3$Clusters_LSI %in% c("C7", "C8")],
        RNA = rnaMedStromaFB
    ),
    CDFB = SimpleList(
        ATAC = proj3$cellNames[proj3$Clusters_LSI %in% c("C9", "C17")],
        RNA = rnaCDFB
    ),
    ProStromaPODs = SimpleList(
        ATAC = proj3$cellNames[proj3$Clusters_LSI %in% c("C5")],
        RNA = rnaProStromaPODs
    ),
    JGMCPC = SimpleList(
        ATAC = proj3$cellNames[proj3$Clusters_LSI %in% c("C15", "C16")],
        RNA = rnaJGMCPC
    ),
    ITSFB = SimpleList(
        ATAC = proj3$cellNames[proj3$Clusters_LSI %in% c("C1")],
        RNA = rnaITSFB
    ),
    UE = SimpleList(
        ATAC = proj3$cellNames[proj3$Clusters_LSI %in% c("C4")],
        RNA = rnaUE
    ),
    MCVSMCJG = SimpleList(
        ATAC = proj3$cellNames[proj3$Clusters_LSI %in% c("C4")],
        RNA = rnaMCVSMCJG
    )
)

# Use the unconstrained naming

proj3 <- addGeneIntegrationMatrix(
    ArchRProj = proj3, 
    useMatrix = "GeneScoreMatrix",
    matrixName = "GeneIntegrationMatrix_LSI_Harmony",
    reducedDims = "LSI_Harmony",
    seRNA = all_sce,
    addToArrow = FALSE, 
    groupList = groupList,
    groupRNA = "ident",
    nameCell = "predictedCell_Co_LSI_Harmony",
    nameGroup = "predictedGroup_Co_LSI_Harmony",
    nameScore = "predictedScore_Co_LSI_Harmony",
    verbose=TRUE,
    threads=1
)

# Set colors for clusters
pal <- paletteDiscrete(values = colData(all_sce)$ident)

# Plot unconstrained RNA-seq integration
p1 <- plotEmbedding(
    proj3, 
    embedding = "UMAP_LSI_Harmony",
    colorBy = "cellColData", 
    name = "predictedGroup_Un_LSI_Harmony", 
    pal = pal
)

svglite(paste0(sample_name,
               "_unconstrained_integrated_UMAP_",
               Sys.Date(), ".svg"))
p1
dev.off()

# Plot constrained RNA-seq integration
p2 <- plotEmbedding(
    proj3,
    embedding = "UMAP_LSI_Harmony",
    colorBy = "cellColData", 
    name = "predictedGroup_Co_LSI_Harmony", 
    pal = pal
)

svglite(paste0(sample_name,
               "_constrained_integrated_UMAP_",
               Sys.Date(), ".svg"))
p2
dev.off()


# add the linked gene expression data to each of the Arrow files
proj3 <- addGeneIntegrationMatrix(
    ArchRProj = proj3, 
    useMatrix = "GeneScoreMatrix",
    matrixName = "GeneIntegrationMatrix_LSI_Harmony",
    reducedDims = "LSI_Harmony",
    seRNA = all_sce,
    addToArrow = TRUE,
    force= TRUE,
    groupList = groupList,
    groupRNA = "ident",
    nameCell = "predictedCell_LSI_Harmony",
    nameGroup = "predictedGroup_LSI_Harmony",
    nameScore = "predictedScore_LSI_Harmony",
    threads=1
)

getAvailableMatrices(proj3)
proj3 <- addImputeWeights(proj3)

9d. Set clusters2 manually

Using information from the constrained and unconstrained integration, we can look at the most correlated cell labels stemming from the RNA-seq data to manually assign cluster identities.

Compare top labels for each cluster.

fwrite(cM, paste0("cell_labels_Clusters-LSI_", Sys.Date(), ".csv"), sep=",",
       row.names=TRUE) 
labelUn <- rownames(cM)
labelUn
maxCellLabel <- colnames(cM)[apply(cM, 1, which.max)]

Max hit for cell type. Take the combined cell clusterings to better clarify each cluster.

cbind(maxCellLabel, labelUn)
library(kit)
top_cell_cluster_labels <- cbind(labelUn,
    data.table(first=colnames(cM)[apply(cM, 1, kit::topn,3)[1,]],
               second=colnames(cM)[apply(cM, 1, kit::topn,3)[2,]],
               third=colnames(cM)[apply(cM, 1, kit::topn,3)[3,]]))
fwrite(top_cell_cluster_labels,
       paste0("ranked_cell_labels_Clusters-LSI_", Sys.Date(), ".csv"), sep=",")

Manually set the clusters based on this information. Basically, look at the top overlapping cell types and call clusters as a combination of these markers.

Abbreviations: CD = collecting duct EPIs = epithelial cells FBs = fibroblasts JGs = juxtaglomerular cells ITsS = interstitial cells MED = medullary MCs = mesangial cells PCs = perictyes PCTs = proximal convoluted tubule cells PODs = podocytes VSMCs = vascular smooth muscle cells

remapClust <- c(
    "C1"="ITSs_and_FBs",
    "C2"="PCs_VSMCs_and_JGs",
    "C3"=proliferating_cortical_stroma_and_EPIs,
    "C4"="ureteric_epithelium",
    "C5"="proliferating_stroma_and_PODs",
    "C6"="MED_and_cortical_stroma_and_FBs",
    "C7"="proliferating_MED_stroma_and_FBs",
    "C8"="MED_stroma_and_FBs",
    "C9"="CD_stroma_and_FBs",
    "C10"="proliferating_mesenchyme_and_PODs",
    "C11"="mesenchymal_progenitors",
    "C12"="PCs_VSMCs_and_JGs",
    "C13"="JGs_VSMCs_and_PCs",
    "C14"="JGs_MCs_and_PCs",
    "C15"="MCs_VSMCs_and_JGs",
    "C16"="MCs_VSMCs_and_JGs",
    "C17"="proliferating_CD",
    "C18"="PCTs",
    "C19"="PODs",
    "C20"="PODs"
)

labelManual <- mapLabels(labelUn,
                         oldLabels = names(remapClust),
                         newLabels = remapClust)
labelManual

proj3$Clusters2_LSI <- mapLabels(proj3$Clusters_LSI, 
                                 newLabels = labelManual,
                                 oldLabels = labelUn)

pal2 <- paletteDiscrete(values = proj3$Clusters2_LSI)

p1manual <- plotEmbedding(
    proj3,
    embedding = "UMAP_LSI_Harmony",
    colorBy = "cellColData",
    name = "Clusters2_LSI",
    rastr = FALSE,
    pal = pal2
)
svglite(paste0(sample_name, "_manual_cell_identities_UMAP_",
           Sys.Date(), ".svg"),
    height=5, width=10)
p1manual
dev.off()

save.image(paste0(sample_name, "_Manual-Annotation_", Sys.Date(), ".RData"))

9e. Use manual clusters for downstream processing.

Now, let's add those clusters directly with the manual cell labels, and then add pseudo-bulk replicates and call peaks on this data. We'll use the cluster derived pseudo-bulk replicates for downstream peak analysis.

proj4 <- proj3
proj4$ManualClusters <- proj3$Clusters2_LSI

# Adding more pseudo-bulk replicates
proj4 <- addGroupCoverages(ArchRProj = proj4,
                            groupBy = "ManualClusters",
                            maxReplicates = 10,
                            maxFragments = 100 * 10^6)

# Adding peaks again
proj4 <- addReproduciblePeakSet(
    ArchRProj = proj4, 
    groupBy = "ManualClusters"
)
proj4_peakSet <- as.data.table(getPeakSet(proj4))
svglite("scATAC-seq_Peak_GenomicAnnotationDistribution.svg")
print(ggplot(proj4_peakSet, aes(x=peakType,
                                group=GroupReplicate,
                                fill=GroupReplicate)) +
        geom_bar(color="black"))
dev.off()

proj5 <- addPeakMatrix(proj4)
getAvailableMatrices(proj5)

proj5 <- addImputeWeights(ArchRProj = proj5)

save.image(paste0(sample_name,
                  "_scATAC-seq_Manual-Annotation_", Sys.Date(), ".RData"))