~神威
屈服了,还是GEOquery包方便
#载入数据
library(GEOquery)
gset = getGEO('GSE31718', destdir=".",getGPL = F)
gset=gset[[1]]
pdata=pData(gset)
data=exprs(gset)
#加载包
library(dplyr)
library(tidyr)
library(tibble)
library(limma)
#log2转化
ex <- data
qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
LogC <- (qx[5] > 100) ||
(qx[6]-qx[1] > 50 && qx[2] > 0) ||
(qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)
if (LogC) {
ex[which(ex <= 0)] <- NaN
data <- log2(ex)
print("log2 transform finished")
}else{
print("log2 transform not needed")
}
#标准化
boxplot(data,outline=FALSE, notch=T,las=2)
data=normalizeBetweenArrays(data)
boxplot(data,outline=FALSE, notch=T, las=2)
class(data)
data1 <- as.data.frame(data)
#探针合并
GPL6254_anno <-data.table::fread("data/GSE31718_family.soft",skip ="ID")
GPL6254_anno <- GPL6254_anno %>%
select(ID,GENE_SYMBOL)
data1 <- data1%>%
rownames_to_column(var="ID") %>%
#合并探针的信息
inner_join(GPL6254_anno,by="ID") %>%
#去掉多余信息
select(-ID) %>%
#重新排列
select(GENE_SYMBOL,everything()) %>%
#求出平均数(这边的点号代表上一步产出的数据)
mutate(rowMean =rowMeans(.[,-1])) %>%
#去除symbol中的NA
filter(GENE_SYMBOL != "NA") %>%
#把表达量的平均值按从大到小排序
arrange(desc(rowMean)) %>%
# symbol留下第一个
distinct(GENE_SYMBOL,.keep_all = T) %>%
#反向选择去除rowMean这一列
select(-rowMean) %>%
# 列名变成行名
column_to_rownames(var = "GENE_SYMBOL")
#重新排序
data2 <- select(data1,1,3,5,7,9,11,13,15,17,19,2,4,6,8,10,12,14,16,18,20)
#构建表达矩阵
group <- c(rep('irritative',10),rep('epileptogenic',10))
group <- factor(group,levels = c("irritative","epileptogenic"),ordered = F)
pairinfo = factor(rep(1:10,2))
design <- model.matrix(~ pairinfo+group)
#线性模型拟合
fit <- lmFit(data2,design)
#贝叶斯检验
fit2 <- eBayes(fit)
#输出差异基因
allDiff=topTable(fit2,adjust='BH',coef=2,p.value=0.01,number=Inf)
#画火山图
library(pheatmap)
## 设定差异基因阈值,减少差异基因用于提取表达矩阵
allDiff_pair=topTable(fit2,adjust='BH',coef=2,number=Inf,p.value=0.05,lfc =2)
##提前部分数据用作热图绘制
heatdata <- data2[rownames(allDiff_pair),]
#去除NA值
#library(IDPmisc)
#heatdata <- NaRV.omit(heatdata)
##制作一个分组信息用于注释
annotation_col <- data.frame(group)
rownames(annotation_col) <- colnames(heatdata)
#如果注释出界,可以通过调整格子比例和字体修正
pheatmap(heatdata, #热图的数据
cluster_rows = T,#行聚类
cluster_cols = F,#列聚类,可以看出样本之间的区分度
annotation_col =annotation_col, #标注样本分类
annotation_legend=TRUE, # 显示注释
show_rownames = F,# 显示行名
show_colnames = T,# 显示列名
scale = "row", #以行来标准化,这个功能很不错
color =colorRampPalette(c("blue", "white","red"))(100))
#火山图
library(ggplot2)
library(ggrepel)
library(dplyr)
data <- topTable(fit2,adjust='BH',coef=2,number=Inf)
data$significant <- as.factor(data$adj.P.Val<0.05 & abs(data$logFC) > 0.5)
data$gene <- rownames(data)
ggplot(data=data, aes(x=logFC, y =-log10(adj.P.Val),color=significant)) +
geom_point(alpha=0.8, size=1.2,col="black")+
geom_point(data=subset(data, logFC > 0.5),alpha=0.8, size=1.2,col="red")+
geom_point(data=subset(data, logFC < -0.5),alpha=0.6, size=1.2,col="blue")+
labs(x="log2 (fold change)",y="-log10 (adj.P.Val)")+
theme(plot.title = element_text(hjust = 0.4))+
geom_hline(yintercept = -log10(0.05),lty=4,lwd=0.6,alpha=0.8)+
geom_vline(xintercept = c(0.5,-0.5),lty=4,lwd=0.6,alpha=0.8)+
theme_bw()+
theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black")) +
geom_point(data=subset(data, abs(logFC) > 1),alpha=0.8, size=3,col="green")+
geom_text_repel(data=subset(data, abs(logFC) > 1),
aes(label=gene),col="black",alpha = 0.8)