9a. Integrate with scATAC data
all_sce <- as.SingleCellExperiment(seurat_named)
proj3 <- addGeneIntegrationMatrix(
ArchRProj = proj3,
useMatrix = "GeneScoreMatrix",
matrixName = "GeneIntegrationMatrix_LSI_Harmony",
reducedDims = "LSI_Harmony",
seRNA = all_sce,
addToArrow = FALSE,
groupRNA = "ident",
nameCell = "predictedCell_Un_LSI_Harmony",
nameGroup = "predictedGroup_Un_LSI_Harmony",
nameScore = "predictedScore_Un_LSI_Harmony"
)
cM <- as.matrix(confusionMatrix(proj3$Clusters_LSI, proj3$predictedGroup_Un_LSI_Harmony))
preClust <- colnames(cM)[apply(cM, 1 , which.max)]
#show which scRNA-seq cell type is most abundant in each of the scATAC-seq clusters
cbind(preClust, rownames(cM)) #Assignments
unique(unique(proj3$predictedGroup_Un_LSI_Harmony))
9b. Get cell IDs for each scRNA-seq cluster
unique(colData(all_sce)$ident)
rnaMP <- colnames(all_sce)[grep("mesenchymal_progenitors",
colData(all_sce)$ident)]
rnaProMyeloid <- colnames(all_sce)[
grep("proliferating_myeloid_and_stromal_cells", colData(all_sce)$ident)]
rnaImmune <- colnames(all_sce)[grep("immune_cells", colData(all_sce)$ident)]
rnaCDFB <- colnames(all_sce)[grep("CD_stroma_and_FBs", colData(all_sce)$ident)]
rnaCorticalEPI <- colnames(all_sce)[
grep("cortical_stroma_and_EPIs", colData(all_sce)$ident)]
rnaProStromaENDO <- colnames(all_sce)[
grep("proliferating_stroma_and_ENDOs", colData(all_sce)$ident)]
rnaMedCortFB <- colnames(all_sce)[
grep("MED_and_cortical_stroma_and_FBs", colData(all_sce)$ident)]
rnaITSFB <- colnames(all_sce)[grep("ITSs_and_FBs", colData(all_sce)$ident)]
rnaMedStromaFB <- colnames(all_sce)[
grep("MED_stroma_and_FBs", colData(all_sce)$ident)]
rnaJGMCPC <- colnames(all_sce)[grep("JGs_MCs_and_PCs", colData(all_sce)$ident)]
rnaMCVSMCJG <- colnames(all_sce)[
grep("MCs_VSMCs_and_JGs", colData(all_sce)$ident)]
rnaPCVSMCJG <- colnames(all_sce)[
grep("PCs_VSMCs_and_JGs", colData(all_sce)$ident)]
rnaPCT <- colnames(all_sce)[grep("PCTs", colData(all_sce)$ident)]
rnaPOD1 <- colnames(all_sce)[grep("POD_1", colData(all_sce)$ident)]
rnaPOD2 <- colnames(all_sce)[grep("POD_2", colData(all_sce)$ident)]
rnaProCD <- colnames(all_sce)[
grep("proliferating_CD", colData(all_sce)$ident)]
rnaProStroma <- colnames(all_sce)[
grep("proliferating_stroma", colData(all_sce)$ident)]
rnaProStromaPODs <- colnames(all_sce)[
grep("proliferating_stroma_and_PODs", colData(all_sce)$ident)]
rnaProStromaTubule <- colnames(all_sce)[
grep("proliferating_stroma_and_tubule_cells", colData(all_sce)$ident)]
rnaUE <- colnames(all_sce)[grep("ureteric_epithelium", colData(all_sce)$ident)]
rnaUSFB <- colnames(all_sce)[
grep("ureteric_stroma_and_FBs", colData(all_sce)$ident)]
9c. Create groups of integrated scATAC-seq and scRNA-seq cells by IDs
# "proliferating_CD" "C17"
groupList <- SimpleList(
MedCortFB = SimpleList(
ATAC = proj3$cellNames[proj3$Clusters_LSI %in% c("C6")],
RNA = rnaMedCortFB
),
MP = SimpleList(
ATAC = proj3$cellNames[proj3$Clusters_LSI %in% c("C10", "C11")],
RNA = rnaMP
),
PCVSMCJG = SimpleList(
ATAC = proj3$cellNames[proj3$Clusters_LSI %in% c("C2", "C12", "C13")],
RNA = rnaPCVSMCJG
),
POD1 = SimpleList(
ATAC = proj3$cellNames[proj3$Clusters_LSI %in% c("C19", "C20")],
RNA = rnaPOD1
),
PCT = SimpleList(
ATAC = proj3$cellNames[proj3$Clusters_LSI %in% c("C18")],
RNA = rnaPCT
),
ProStroma = SimpleList(
ATAC = proj3$cellNames[proj3$Clusters_LSI %in% c("C3")],
RNA = rnaProStroma
),
MedStromaFB = SimpleList(
ATAC = proj3$cellNames[proj3$Clusters_LSI %in% c("C7", "C8")],
RNA = rnaMedStromaFB
),
CDFB = SimpleList(
ATAC = proj3$cellNames[proj3$Clusters_LSI %in% c("C9", "C17")],
RNA = rnaCDFB
),
ProStromaPODs = SimpleList(
ATAC = proj3$cellNames[proj3$Clusters_LSI %in% c("C5")],
RNA = rnaProStromaPODs
),
JGMCPC = SimpleList(
ATAC = proj3$cellNames[proj3$Clusters_LSI %in% c("C15", "C16")],
RNA = rnaJGMCPC
),
ITSFB = SimpleList(
ATAC = proj3$cellNames[proj3$Clusters_LSI %in% c("C1")],
RNA = rnaITSFB
),
UE = SimpleList(
ATAC = proj3$cellNames[proj3$Clusters_LSI %in% c("C4")],
RNA = rnaUE
),
MCVSMCJG = SimpleList(
ATAC = proj3$cellNames[proj3$Clusters_LSI %in% c("C4")],
RNA = rnaMCVSMCJG
)
)
# Use the unconstrained naming
proj3 <- addGeneIntegrationMatrix(
ArchRProj = proj3,
useMatrix = "GeneScoreMatrix",
matrixName = "GeneIntegrationMatrix_LSI_Harmony",
reducedDims = "LSI_Harmony",
seRNA = all_sce,
addToArrow = FALSE,
groupList = groupList,
groupRNA = "ident",
nameCell = "predictedCell_Co_LSI_Harmony",
nameGroup = "predictedGroup_Co_LSI_Harmony",
nameScore = "predictedScore_Co_LSI_Harmony",
verbose=TRUE,
threads=1
)
# Set colors for clusters
pal <- paletteDiscrete(values = colData(all_sce)$ident)
# Plot unconstrained RNA-seq integration
p1 <- plotEmbedding(
proj3,
embedding = "UMAP_LSI_Harmony",
colorBy = "cellColData",
name = "predictedGroup_Un_LSI_Harmony",
pal = pal
)
svglite(paste0(sample_name,
"_unconstrained_integrated_UMAP_",
Sys.Date(), ".svg"))
p1
dev.off()
# Plot constrained RNA-seq integration
p2 <- plotEmbedding(
proj3,
embedding = "UMAP_LSI_Harmony",
colorBy = "cellColData",
name = "predictedGroup_Co_LSI_Harmony",
pal = pal
)
svglite(paste0(sample_name,
"_constrained_integrated_UMAP_",
Sys.Date(), ".svg"))
p2
dev.off()
# add the linked gene expression data to each of the Arrow files
proj3 <- addGeneIntegrationMatrix(
ArchRProj = proj3,
useMatrix = "GeneScoreMatrix",
matrixName = "GeneIntegrationMatrix_LSI_Harmony",
reducedDims = "LSI_Harmony",
seRNA = all_sce,
addToArrow = TRUE,
force= TRUE,
groupList = groupList,
groupRNA = "ident",
nameCell = "predictedCell_LSI_Harmony",
nameGroup = "predictedGroup_LSI_Harmony",
nameScore = "predictedScore_LSI_Harmony",
threads=1
)
getAvailableMatrices(proj3)
proj3 <- addImputeWeights(proj3)
9d. Set clusters2 manually
Using information from the constrained and unconstrained integration, we can look at the most correlated cell labels stemming from the RNA-seq data to manually assign cluster identities.
Compare top labels for each cluster.
fwrite(cM, paste0("cell_labels_Clusters-LSI_", Sys.Date(), ".csv"), sep=",",
row.names=TRUE)
labelUn <- rownames(cM)
labelUn
maxCellLabel <- colnames(cM)[apply(cM, 1, which.max)]
Max hit for cell type. Take the combined cell clusterings to better clarify each cluster.
cbind(maxCellLabel, labelUn)
library(kit)
top_cell_cluster_labels <- cbind(labelUn,
data.table(first=colnames(cM)[apply(cM, 1, kit::topn,3)[1,]],
second=colnames(cM)[apply(cM, 1, kit::topn,3)[2,]],
third=colnames(cM)[apply(cM, 1, kit::topn,3)[3,]]))
fwrite(top_cell_cluster_labels,
paste0("ranked_cell_labels_Clusters-LSI_", Sys.Date(), ".csv"), sep=",")
Manually set the clusters based on this information. Basically, look at the top overlapping cell types and call clusters as a combination of these markers.
Abbreviations: CD = collecting duct EPIs = epithelial cells FBs = fibroblasts JGs = juxtaglomerular cells ITsS = interstitial cells MED = medullary MCs = mesangial cells PCs = perictyes PCTs = proximal convoluted tubule cells PODs = podocytes VSMCs = vascular smooth muscle cells
remapClust <- c(
"C1"="ITSs_and_FBs",
"C2"="PCs_VSMCs_and_JGs",
"C3"=proliferating_cortical_stroma_and_EPIs,
"C4"="ureteric_epithelium",
"C5"="proliferating_stroma_and_PODs",
"C6"="MED_and_cortical_stroma_and_FBs",
"C7"="proliferating_MED_stroma_and_FBs",
"C8"="MED_stroma_and_FBs",
"C9"="CD_stroma_and_FBs",
"C10"="proliferating_mesenchyme_and_PODs",
"C11"="mesenchymal_progenitors",
"C12"="PCs_VSMCs_and_JGs",
"C13"="JGs_VSMCs_and_PCs",
"C14"="JGs_MCs_and_PCs",
"C15"="MCs_VSMCs_and_JGs",
"C16"="MCs_VSMCs_and_JGs",
"C17"="proliferating_CD",
"C18"="PCTs",
"C19"="PODs",
"C20"="PODs"
)
labelManual <- mapLabels(labelUn,
oldLabels = names(remapClust),
newLabels = remapClust)
labelManual
proj3$Clusters2_LSI <- mapLabels(proj3$Clusters_LSI,
newLabels = labelManual,
oldLabels = labelUn)
pal2 <- paletteDiscrete(values = proj3$Clusters2_LSI)
p1manual <- plotEmbedding(
proj3,
embedding = "UMAP_LSI_Harmony",
colorBy = "cellColData",
name = "Clusters2_LSI",
rastr = FALSE,
pal = pal2
)
svglite(paste0(sample_name, "_manual_cell_identities_UMAP_",
Sys.Date(), ".svg"),
height=5, width=10)
p1manual
dev.off()
save.image(paste0(sample_name, "_Manual-Annotation_", Sys.Date(), ".RData"))
9e. Use manual clusters for downstream processing.
Now, let's add those clusters directly with the manual cell labels, and then add pseudo-bulk replicates and call peaks on this data. We'll use the cluster derived pseudo-bulk replicates for downstream peak analysis.
proj4 <- proj3
proj4$ManualClusters <- proj3$Clusters2_LSI
# Adding more pseudo-bulk replicates
proj4 <- addGroupCoverages(ArchRProj = proj4,
groupBy = "ManualClusters",
maxReplicates = 10,
maxFragments = 100 * 10^6)
# Adding peaks again
proj4 <- addReproduciblePeakSet(
ArchRProj = proj4,
groupBy = "ManualClusters"
)
proj4_peakSet <- as.data.table(getPeakSet(proj4))
svglite("scATAC-seq_Peak_GenomicAnnotationDistribution.svg")
print(ggplot(proj4_peakSet, aes(x=peakType,
group=GroupReplicate,
fill=GroupReplicate)) +
geom_bar(color="black"))
dev.off()
proj5 <- addPeakMatrix(proj4)
getAvailableMatrices(proj5)
proj5 <- addImputeWeights(ArchRProj = proj5)
save.image(paste0(sample_name,
"_scATAC-seq_Manual-Annotation_", Sys.Date(), ".RData"))