看我~带土~神威
length(list.files())
length(dir())
dir()[1]
dir.create("GBM/00_data_download_from_gdc")
#方法1
for (dirname in dir("GBM/gdc_download_20210617_120134.142579/")){
## 使用list.files函数找到rawdata里面单个文件夹下面的压缩文件
file <- list.files(paste0(getwd(),"/GBM/gdc_download_20210617_120134.142579/",dirname),pattern = "*.counts") #找到对应文件夹中的内容,pattern可以是正则表达式
## 使用file.copy函数复制粘贴压缩文件到data_in_one
file.copy(paste0(getwd(),"/GBM/gdc_download_20210617_120134.142579/",dirname,"/",file),"GBM/00_data_download_from_gdc") #复制内容到新的文件夹
}
#方法2
#for (dirname in dir()[2:length(dir())]){
# file <- list.files(dirname,pattern = "*.counts") #找到对应文件夹中的内容,pattern可以是正则表达式
#file.copy(paste0(dirname,"/",file),"GBM/00_data_download_from_gdc") #复制内容到新的文件夹
#}
#提取GBM的信息
metadata <- jsonlite::fromJSON("GBM/metadata.cart.2021-06-17.json")
GBM_df <- data.frame()
for (i in 1:nrow(metadata)){
GBM_df[i,1] <- metadata$file_name[i]
GBM_df[i,2] <- metadata$associated_entities[i][[1]]$entity_submitter_id
}
colnames(GBM_df) <- c("filename","TCGA_id")
#读取GBM数据
test <- data.table::fread(paste0("GbM/00_data_download_from_gdc/",GBM_df$filename[1]))
expr_GBM <- data.frame(matrix(NA,nrow(test),nrow(GBM_df)))
for (i in 1:nrow(GBM_df)) {
print(i)
expr_GBM[,i]= data.table::fread(paste0("GbM/00_data_download_from_gdc/",GBM_df$filename[i]))[,2]
}
#给读入的数据添加列名和基因名称
colnames(expr_GBM) <- GBM_df$TCGA_id
test <- head(expr_GBM)
gene_id <- data.table::fread(paste0("GbM/00_data_download_from_gdc/",GBM_df$filename[1]))$V1
expr_GBM <- cbind(gene_id=gene_id,expr_GBM)
test <- head(expr_GBM)
#去除后5行,保存数据成Rdata格式
test <- tail(expr_GBM$gene_id,10)
test
expr_GBM <- expr_GBM[1:(nrow(expr_GBM)-5),]
save(expr_GBM,file = "expr_GBM.Rdata")
#同理做LGG
dir.create("LGG/00_data_download_from_gdc1")
#方法1
for (dirname in dir("LGG/gdc_download_20210617_121812.431352/")){
## 使用list.files函数找到rawdata里面单个文件夹下面的压缩文件
file <- list.files(paste0(getwd(),"/LGG/gdc_download_20210617_121812.431352/",dirname),pattern = "*.counts") #找到对应文件夹中的内容,pattern可以是正则表达式
## 使用file.copy函数复制粘贴压缩文件到data_in_one
file.copy(paste0(getwd(),"/LGG/gdc_download_20210617_121812.431352/",dirname,"/",file),"LGG/00_data_download_from_gdc1") #复制内容到新的文件夹
}
#提取LGG的信息
metadata1 <- jsonlite::fromJSON("LGG/metadata.cart.2021-06-17(1).json")
LGG_df <- data.frame()
for (i in 1:nrow(metadata1)){
LGG_df[i,1] <- metadata1$file_name[i]
LGG_df[i,2] <- metadata1$associated_entities[i][[1]]$entity_submitter_id
}
colnames(LGG_df) <- c("filename","TCGA_id")
#读取LGG数据
test <- data.table::fread(paste0("LGG/00_data_download_from_gdc1/",LGG_df$filename[1]))
expr_LGG <- data.frame(matrix(NA,nrow(test),nrow(LGG_df)))
for (i in 1:nrow(expr_LGG)) {
print(i)
expr_LGG[,i]= data.table::fread(paste0("LGG/00_data_download_from_gdc1/",LGG_df$filename[i]))[,2]
}
#给读入的数据添加列名和基因名称
colnames(expr_LGG) <- LGG_df$TCGA_id
test <- head(expr_LGG)
gene_id <- data.table::fread(paste0("LGG/00_data_download_from_gdc1/",LGG_df$filename[1]))$V1
expr_LGG <- cbind(gene_id=gene_id,expr_LGG)
test <- head(expr_LGG)
#去除后5行,保存数据成Rdata格式
test <- tail(expr_LGG$gene_id,10)
test
expr_LGG <- expr_LGG[1:(nrow(expr_LGG)-5),]
save(expr_LGG,file = "expr_LGG.Rdata")
##################################################################################################################
load(file = "expr_LGG.Rdata")
load(file = "expr_GBM.Rdata")
test1 <- head(expr_LGG)
test <- head(expr_GBM)
library(dplyr)
library(tibble)
library(tidyr)
expr_LGG_GBM <- expr_LGG %>%
inner_join(expr_GBM,by="gene_id")
test <- head(expr_LGG_GBM)
#构建metadata文件
metadata <- data.frame(sample_id = colnames(expr_LGG_GBM)[-1])
sample <- c(rep("LGG",529),rep("GBM",169))
metadata$sample <- relevel(factor(sample),"LGG")
#构建dds对象
library(DESeq2)
## 第一列有名称,所以tidy=TRUE
dds <-DESeqDataSetFromMatrix(countData=expr_LGG_GBM,
colData=metadata,
design=~sample,
tidy=TRUE)
nrow(dds)
rownames(dds)
#数据过滤,如果一行基因在所有样本中的counts数小于等于1,我们就把他删掉
dds <- dds[rowSums(counts(dds))>1,]
#样本聚类(没必要!卡死!)
vsd <- vst(dds, blind = FALSE)
sampleDists <- dist(t(assay(vsd)))
hc <- hclust(sampleDists, method = "ward.D2")
plot(hc, hang = -1)
library(factoextra)
res <- hcut(sampleDists, k = 2, stand = TRUE)
# Visualize
fviz_dend(res,
# 加边框
rect = TRUE,
# 边框颜色
rect_border="cluster",
# 边框线条类型
rect_lty=2,
# 边框线条粗细
lwd=1.2,
# 边框填充
rect_fill = T,
# 字体大小
cex = 1,
# 字体颜色
color_labels_by_k=T,
# 平行放置
horiz=T)
####################################################################################
#Deseq2 计算(非常耗时,建议保存)
dds <- DESeq(dds)
save(dds,file = "dds.Rdata")
####################################################################################
#得到dds之后,我们可以通过counts这个函数得到能作图的标注化后的counts数据,他矫正了样本间测序的深度,使得样本间可以直接比较
normalized_counts <- as.data.frame(counts(dds, normalized=TRUE))
test <- head(normalized_counts)
p<-plotCounts(dds, gene = "ENSG00000000460.15", intgroup=c("sample"))
plotdata <- plotCounts(dds, gene = "ENSG00000072682.17", intgroup=c("sample"),returnData = T)
library(ggplot2)
ggplot(plotdata,aes(x=sample,y=count,col=sample))+
geom_point()+
theme_bw()
contrast <- c("sample", "LGG", "GBM")
dd1 <- results(dds, contrast=contrast, alpha = 0.05)
plotMA(dd1, ylim=c(-2,2))
BiocManager::install("apeglm")
#install.packages("ashr")
#install.packages("irlba")
library(irlba)
library(ashr)
dd2 <- lfcShrink(dds, contrast=contrast,res=dd1,type = "ashr")
plotMA(dd2, ylim=c(-2,2))
#差异分析的结果
summary(dd2, alpha = 0.05)
library(dplyr)
library(tibble)
res <- dd2 %>%
data.frame() %>%
rownames_to_column("gene_id")
#ID转换
library(AnnotationDbi)
library(org.Hs.eg.db)
library(tidyr)
res <- res %>%
tidyr::separate(gene_id,into = c("gene_id"),sep="\\.")
test <- head(res)
test <- tail(res)
res$symbol <- mapIds(org.Hs.eg.db,
keys=res$gene_id,
column="SYMBOL",
keytype="ENSEMBL",
multiVals="first")
res$entrez <- mapIds(org.Hs.eg.db,
keys=res$gene_id,
column="ENTREZID",
keytype="ENSEMBL",
multiVals="first")
[FPKM、RPKM都是什么][1] [1]:https://hbctraining.github.io/DGE_workshop/lessons/02_DGE_count_normalization.html