首页 > 编程知识 正文

edger rnaseq差异分析,rnaseq数据

时间:2023-05-06 19:04:13 阅读:269232 作者:101

文章目录 1. 加载R包和输入数据2. 表达数据整理3.edgeR包做差异表达4.limma包做差异表达5.DESeq2包做差异表达6.比较三种包差异表达基因筛选结果总结:

1. 加载R包和输入数据 rm(list = ls())library("DESeq2")library("limma")library("edgeR")expr = read.csv("mRNA_exprSet.csv",sep = ',',header=T) head(expr) TCGA-mRNA数据链接
链接:https://pan.baidu.com/s/1b6l4qg40NjDNCkyiEow4aA
提取码:7zii 2. 表达数据整理 对重复基因名取平均表达量,然后将基因名作为行名 expr = avereps(expr[,-1],ID = expr$X) # 自定义 去除低表达的基因 expr = expr[rowMeans(expr)>1,] # 自定义 表达矩阵分组(癌症组织和癌旁组织) library(stringr)tumor <- colnames(expr)[as.integer(substr(colnames(expr),14,15)) < 10]normal <- colnames(expr)[as.integer(substr(colnames(expr),14,15)) >= 10]tumor_sample <- expr[,tumor]normal_sample <- expr[,normal]exprSet_by_group <- cbind(tumor_sample,normal_sample)group_list <- c(rep('tumor',ncol(tumor_sample)),rep('normal',ncol(normal_sample)))save(exprSet_by_group, group_list, file = 'exprSet_by_group_list.Rdata') 3.edgeR包做差异表达 表达矩阵 data = exprSet_by_group 分组矩阵 group_list = factor(group_list)design <- model.matrix(~0+group_list)rownames(design) = colnames(data)colnames(design) <- levels(group_list) 差异表达矩阵 DGElist <- DGEList( counts = data, group = group_list)## Counts per Million or Reads per Kilobase per Millionkeep_gene <- rowSums( cpm(DGElist) > 1 ) >= 2 ## 自定义table(keep_gene)DGElist <- DGElist[ keep_gene, , keep.lib.sizes = FALSE ]DGElist <- calcNormFactors( DGElist )DGElist <- estimateGLMCommonDisp(DGElist, design)DGElist <- estimateGLMTrendedDisp(DGElist, design)DGElist <- estimateGLMTagwiseDisp(DGElist, design)fit <- glmFit(DGElist, design)results <- glmLRT(fit, contrast = c(-1, 1)) nrDEG_edgeR <- topTags(results, n = nrow(DGElist))nrDEG_edgeR <- as.data.frame(nrDEG_edgeR)head(nrDEG_edgeR) 提取基因差异显著的差异矩阵 padj = 0.01 # 自定义foldChange= 2 # 自定义nrDEG_edgeR_signif = nrDEG_edgeR[(nrDEG_edgeR$FDR < padj & (nrDEG_edgeR$logFC>foldChange | nrDEG_edgeR$logFC<(-foldChange))),]nrDEG_edgeR_signif = nrDEG_edgeR_signif[order(nrDEG_edgeR_signif$logFC),]save(nrDEG_edgeR_signif,file = 'nrDEG_edgeR_signif.Rdata') 4.limma包做差异表达 表达矩阵 data = exprSet_by_group 分组矩阵 group_list = factor(group_list)design <- model.matrix(~0+group_list)rownames(design) = colnames(data)colnames(design) <- levels(group_list) 差异表达矩阵 DGElist <- DGEList( counts = data, group = group_list )keep_gene <- rowSums( cpm(DGElist) > 1 ) >= 2 # 自定义table(keep_gene)DGElist <- DGElist[ keep_gene, , keep.lib.sizes = FALSE ]DGElist <- calcNormFactors( DGElist )v <- voom(DGElist, design, plot = TRUE, normalize = "quantile")fit <- lmFit(v, design)cont.matrix <- makeContrasts(contrasts = c('tumor-normal'), levels = design)fit2 <- contrasts.fit(fit, cont.matrix)fit2 <- eBayes(fit2)nrDEG_limma_voom = topTable(fit2, coef = 'tumor-normal', n = Inf)nrDEG_limma_voom = na.omit(nrDEG_limma_voom)head(nrDEG_limma_voom) 提取基因差异显著的差异矩阵 padj = 0.01 # 自定义foldChange= 2 # 自定义nrDEG_limma_voom_signif = nrDEG_limma_voom[(nrDEG_limma_voom$adj.P.Val < padj & (nrDEG_limma_voom$logFC>foldChange | nrDEG_limma_voom$logFC<(-foldChange))),]nrDEG_limma_voom_signif = nrDEG_limma_voom_signif[order(nrDEG_limma_voom_signif$logFC),]save(nrDEG_limma_voom_signif, file = 'nrDEG_limma_voom_signif') 5.DESeq2包做差异表达 表达矩阵 data = exprSet_by_group 分组矩阵 condition = factor(group_list)coldata <- data.frame(row.names = colnames(data), condition)dds <- DESeqDataSetFromMatrix(countData = data, colData = coldata, design = ~condition)dds$condition<- relevel(dds$condition, ref = "normal") # 指定哪一组作为对照组 差异表达矩阵 dds <- DESeq(dds) allDEG2 <- as.data.frame(results(dds)) 提取基因差异显著的差异矩阵 padj = 0.01 # 自定义foldChange= 2 # 自定义nrDEG_DESeq2_signif = allDEG2[(allDEG2$padj < padj & (allDEG2$log2FoldChange>foldChange | allDEG2$log2FoldChange<(-foldChange))),]nrDEG_DESeq2_signif = nrDEG_DESeq2_signif[order(nrDEG_DESeq2_signif$log2FoldChange),]save(nrDEG_DESeq2_signif, file = 'nrDEG_DESeq2_signif') 6.比较三种包差异表达基因筛选结果 edgeR = rownames(nrDEG_edgeR_signif)dim(nrDEG_edgeR_signif)limma = rownames(nrDEG_limma_voom_signif)dim(nrDEG_limma_voom_signif)DESeq2 = rownames(nrDEG_DESeq2_signif)dim(nrDEG_DESeq2_signif) library(VennDiagram)venn.diagram( x = list( 'edgeR(2675)' = edgeR, 'limma(2618)' = limma, 'DESeq2(2913)' = DESeq2 ), filename = 'VN.png', col = "black", fill = c("dodgerblue", "goldenrod1", "darkorange1"), alpha = 0.5, cex = 0.8, cat.col = 'black', cat.cex = 0.8, cat.fontface = "bold", margin = 0.05, main = "三种包的差异表达基因比较", main.cex = 1.2)

总结:

三种包共同筛选出的差异表达基因有1989个,总体来说筛选效果相差不是很大。

本博客内容将同步更新到个人微信公众号生信玩家。欢迎大家关注~~~

版权声明:该文观点仅代表作者本人。处理文章:请发送邮件至 三1五14八八95#扣扣.com 举报,一经查实,本站将立刻删除。