看我~带土~神威
1、从UCSC下载数据然后读取,用data.table::fread 最好啦
#读取数据
data <- data.table::fread("data/tcga_RSEM_gene_tpm.gz")
geneid <- data.table::fread("data/gencode.v23.annotation.gtf.gz")
clin <- data.table::fread("data/Survival_SupplementalTable_S1_20171025_xena_sp")
jiba <-data.table::fread("data/probeMap_gencode.v23.annotation.gene.probemap")
2、加载包
library(dplyr)
library(tibble)
library(tidyr)
3、修剪一下探针信息表
jiba <- jiba[,1:2]
colnames(data)[1] <- 'id'
test <- data1[1:10,1:10]
4、合并探针信息
data1 <- data %>%
inner_join(jiba,by="id") %>%
## 去掉多余信息
select(-id) %>%
## 重新排列
select(gene,everything()) #%>%
5、去重
data1 <- data1 %>%
mutate(rowMean =rowMeans(.[,-1])) %>%
## 把表达量的平均值按从大到小排序
arrange(desc(rowMean)) %>%
## 去重,symbol留下第一个
distinct(gene,.keep_all = T) %>%
## 反向选择去除rowMean这一列
select(-rowMean) %>%
column_to_rownames("gene")
6、转制
data1 <- as.data.frame(t(data1))
7、修剪临床信息表
metadata <- data.frame(clin[,1])
metadata <- metadata %>%
mutate(group=sample) %>%
separate(group,sep = "-",into = c("q1","q2","q3","subtype"))%>%
select("subtype")
clin2 <- cbind(clin,metadata)
clin2 <- select(clin2,1, 3, 35)
clin2 <- filter(clin2,subtype=="01")
7、与临床信息合并
colnames(clin2)[2] <- 'type'
data1 <- data1 %>%
## 行名转列名,因为只有变成数据框的列,才可以用inner_join
rownames_to_column("sample") %>%
## 合并探针的信息
inner_join(clin2,by="sample")
8、定义函数
###批量计算各个癌症中的相关性系数和p值
pancor <- function(gene1,gene2,data){
data1 <- split(data,data$type)
do.call(rbind,lapply(data1, function(x){
dd <- cor.test(as.numeric(x[,gene1]),as.numeric(x[,gene2]),type="pearson")
data.frame(type=x$type[1],cor=dd$estimate,p.value=dd$p.value )
}))
}
9、画图
plotdf <- pancor("HIF1A","P4HA2",data1)
library(ggplot2)
options(scipen = 2)
ggplot(plotdf,aes(-log10(p.value),cor))+
## 画点
geom_point(size=6,fill="purple",shape = 21,colour="black",stroke = 2)+
## 改变y轴
scale_y_continuous(expand = c(0,0),limits = c(-1,1),breaks = seq(-1,1,0.2))+
## 改变x轴
scale_x_log10(expand = c(0,0),limits = c(0.1, 1000),breaks = c(0.1,10,1000))+
## 加横线
geom_hline(yintercept = 0,size=1.5)+
## 加竖线
geom_vline(xintercept = -log10(0.5),size=1.5)+
## 坐标轴
labs(x=bquote(-log[10]~italic("P")),y="Pearson correlation (r)")+
theme(axis.title=element_text(size=20),
axis.text = element_text(face = "bold",size = 16),
axis.ticks.length=unit(.4, "cm"),
axis.ticks = element_line(colour = "black", size = 1),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
panel.border = element_rect(colour = "black", fill=NA, size=1.5),
plot.margin = margin(1, 1, 1, 1, "cm"))