GEO-2

～神威

屈服了，还是GEOquery包方便

#载入数据
library(GEOquery)
gset = getGEO('GSE31718', destdir=".",getGPL = F)
gset=gset[[1]]
pdata=pData(gset)
data=exprs(gset)
#加载包
library(dplyr)
library(tidyr)
library(tibble)
library(limma)
#log2转化
ex <- data
qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
LogC <- (qx[5] > 100) ||
  (qx[6]-qx[1] > 50 && qx[2] > 0) ||
  (qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)
if (LogC) {
  ex[which(ex <= 0)] <- NaN
  data <- log2(ex)
  print("log2 transform finished")
}else{
  print("log2 transform not needed")
}
#标准化
boxplot(data,outline=FALSE, notch=T,las=2)
data=normalizeBetweenArrays(data)
boxplot(data,outline=FALSE, notch=T, las=2)
class(data)
data1 <- as.data.frame(data)
#探针合并
GPL6254_anno <-data.table::fread("data/GSE31718_family.soft",skip ="ID")
GPL6254_anno <- GPL6254_anno %>%
  select(ID,GENE_SYMBOL)
data1 <- data1%>%
  rownames_to_column(var="ID") %>%
  #合并探针的信息
  inner_join(GPL6254_anno,by="ID") %>%
  #去掉多余信息
  select(-ID) %>%
  #重新排列
  select(GENE_SYMBOL,everything()) %>%
  #求出平均数(这边的点号代表上一步产出的数据)
  mutate(rowMean =rowMeans(.[,-1])) %>%
  #去除symbol中的NA
  filter(GENE_SYMBOL != "NA") %>%

  #把表达量的平均值按从大到小排序
  arrange(desc(rowMean)) %>%
  # symbol留下第一个
  distinct(GENE_SYMBOL,.keep_all = T) %>%
  #反向选择去除rowMean这一列
  select(-rowMean) %>%
  # 列名变成行名
  column_to_rownames(var = "GENE_SYMBOL")
#重新排序
data2 <- select(data1,1,3,5,7,9,11,13,15,17,19,2,4,6,8,10,12,14,16,18,20)
#构建表达矩阵
group <- c(rep('irritative',10),rep('epileptogenic',10))
group <- factor(group,levels = c("irritative","epileptogenic"),ordered = F)
pairinfo = factor(rep(1:10,2))
design <- model.matrix(~ pairinfo+group)

#线性模型拟合
fit <- lmFit(data2,design)
#贝叶斯检验
fit2 <- eBayes(fit)
#输出差异基因
allDiff=topTable(fit2,adjust='BH',coef=2,p.value=0.01,number=Inf)
#画火山图
library(pheatmap)
## 设定差异基因阈值，减少差异基因用于提取表达矩阵
allDiff_pair=topTable(fit2,adjust='BH',coef=2,number=Inf,p.value=0.05,lfc =2)
##提前部分数据用作热图绘制
heatdata <- data2[rownames(allDiff_pair),]
#去除NA值
#library(IDPmisc)
#heatdata <- NaRV.omit(heatdata)
##制作一个分组信息用于注释
annotation_col <- data.frame(group)
rownames(annotation_col) <- colnames(heatdata)
#如果注释出界，可以通过调整格子比例和字体修正




pheatmap(heatdata, #热图的数据
         cluster_rows = T,#行聚类
         cluster_cols = F,#列聚类，可以看出样本之间的区分度
         annotation_col =annotation_col, #标注样本分类
         annotation_legend=TRUE, # 显示注释
         show_rownames = F,# 显示行名
         show_colnames = T,# 显示列名
         scale = "row", #以行来标准化，这个功能很不错
         color =colorRampPalette(c("blue", "white","red"))(100))

#火山图
library(ggplot2)
library(ggrepel)
library(dplyr)

data <- topTable(fit2,adjust='BH',coef=2,number=Inf)
data$significant <- as.factor(data$adj.P.Val<0.05 & abs(data$logFC) > 0.5)
data$gene <- rownames(data)
ggplot(data=data, aes(x=logFC, y =-log10(adj.P.Val),color=significant)) +
  geom_point(alpha=0.8, size=1.2,col="black")+
  geom_point(data=subset(data, logFC > 0.5),alpha=0.8, size=1.2,col="red")+
  geom_point(data=subset(data, logFC < -0.5),alpha=0.6, size=1.2,col="blue")+
  labs(x="log2 (fold change)",y="-log10 (adj.P.Val)")+
  theme(plot.title = element_text(hjust = 0.4))+
  geom_hline(yintercept = -log10(0.05),lty=4,lwd=0.6,alpha=0.8)+
  geom_vline(xintercept = c(0.5,-0.5),lty=4,lwd=0.6,alpha=0.8)+
  theme_bw()+
  theme(panel.border = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),   
        axis.line = element_line(colour = "black")) +
  geom_point(data=subset(data, abs(logFC) > 1),alpha=0.8, size=3,col="green")+
  geom_text_repel(data=subset(data, abs(logFC) > 1),
                  aes(label=gene),col="black",alpha = 0.8)