BIOINFORMATICS LAB Episode VI Gene Networks with R · BIOINFORMATICS LAB Episode VI ... X3 Level....
Transcript of BIOINFORMATICS LAB Episode VI Gene Networks with R · BIOINFORMATICS LAB Episode VI ... X3 Level....
BIOINFORMATICS LAB
Episode VI – Gene Networks
with R
Federico M. Giorgi, PhD · Chiara Cabrelle, TA
Department of Pharmacy and Biotechnology
First Cycle Degree in Genomics
2/80
R Intro Recap (1)
# Vector
length(vector)
names(vector)
# Matrix
nrow(matrix)
ncol(matrix)
dim(matrix)
rownames(matrix)
colnames(matrix)
# Concatenation
# c() joins vectors end to end; cbind()/rbind() stack vectors or matrices
# side by side (as columns) or on top of each other (as rows).
longer_vector<-c(vector,vector)
matrix<-cbind(vector1,vector2) # Column Bind
large_matrix<-rbind(matrix1,matrix2) # Row Bind
large_matrix<-cbind(matrix1,matrix2) # Column Bind
Object types
3/80
R Intro Recap (1)
# Vector
length(vector)
names(vector)
# Matrix
nrow(matrix)
ncol(matrix)
dim(matrix)
rownames(matrix)
colnames(matrix)
# Concatenation
# c() joins vectors end to end; cbind()/rbind() stack vectors or matrices
# side by side (as columns) or on top of each other (as rows).
longer_vector<-c(vector,vector)
matrix<-cbind(vector1,vector2) # Column Bind
large_matrix<-rbind(matrix1,matrix2) # Row Bind
large_matrix<-cbind(matrix1,matrix2) # Column Bind
Object types
4/80
R Intro Recap (2)
# Structure
function_name<-function(param1,param2,param3){
result<-operation(param1,param2)
result<-result+param3
return(result)
}
Custom Functions
# Install Libraries (only once)
# The biocLite.R script has been retired by Bioconductor; BiocManager is the
# supported installer. Install BiocManager itself from CRAN if it is missing.
if(!requireNamespace("BiocManager",quietly=TRUE)){
  install.packages("BiocManager")
}
BiocManager::install("enrichplot")
# Load Libraries (Every time you restart R)
library("enrichplot")
# Help
?fisher.test # Guide for a function
??fish # Fuzzy search
browseVignettes("enrichplot") # Full manual (for good libraries)
Convenience Commands
5/80
R Intro Recap (3)
# While
i<-0
while(i<10){
i<-i+1
message(i)
}
# For
for(i in 1:10){
message(i)
}
# The apply trick
X<-matrix(rnorm(5*4),nrow=5,ncol=4)
apply(X,1,sum)
apply(X,2,sum)
# Nested For Loop
for(i in 1:nrow(X)){
for(j in 1:ncol(X)){
message(i," and ",j)
}
}
Loops
6/80
R Intro Recap (4)
# Read Text files
input<-read.csv("file.csv",as.is=TRUE)
input<-read.delim("file.txt",as.is=TRUE,sep="\t")
# Write Text files
write.csv(object,file="output.csv")
write.table(object,file="output.txt")
# Import Rdata files
load("input.rda")
# Save Rdata files
# The output path must be passed by name: every unnamed argument to save()
# is interpreted as an object to store, so a bare "input.rda" would be looked
# up as an object name and the call would fail with "'file' must be specified".
save(object1,object2,file="input.rda")
Reading/Writing from files
7/80
# Scatterplot or XY plot
x<-rnorm(100)
y<-rnorm(100,mean=1)+x
plot(x,y) # Simple one
# Plot Beautification
plot(x,y,pch=19,col="navy",main="Scatterplot",xlab="Independent Variable",ylab="Dependent Variable")
grid()
mtext("Subtitle")
text(1,-3,"A word")
lm1<-lm(y~x) # A regression line
abline(lm1,lwd=3,col="red")
R Intro Recap (5)Scatterplots: Relationships between Two Variables
8/80
# Boxplot
boxplot(x,y,2*y,col=c("green","orange","red"),main="Boxplot",names=c("X","Y","2Y"))
R Intro Recap (6)Boxplots: comparing multiple (1-10) variables
Median
9/80
# Boxplot
boxplot(x,y,2*y,col=c("green","orange","red"),main="Boxplot",names=c("X","Y","2Y"))
R Intro Recap (6)Boxplots: comparing multiple (1-10) variables
Default R Boxplots Features
Median
1st Quartile
3rd Quartile
InterQuartile Range (IQR)
Upper Whisker (falls on the closest point at 3rdQ + 1.5IQR)
Lower Whisker (falls on the closest point at 1stQ - 1.5IQR)
Outliers
10/80
# Matrix with 4 correlated variables, 30 observations (or samples)
x1<-rnorm(30)
x2<-x1+rnorm(30,sd=0.1)
x3<-rnorm(30,sd=0.1)
x4<-rnorm(30)-x1
matrix<-cbind(x1,x2,x3,x4)
# Matplot
matplot(matrix,type="l",lwd=2,main="Matplot")
legend("topright",legend=paste0("X",1:4),col=1:4,lty=1:4,lwd=3)
Visualizing Gene Expression ProfilesMatplots: comparing multiple variables
Ge
ne
Exp
ressio
n L
eve
l
11/80
# Matrix with 4 correlated variables, 30 observations (or samples)
x1<-rnorm(30)
x2<-x1+rnorm(30,sd=0.1)
x3<-rnorm(30,sd=0.1)
x4<-rnorm(30)-x1
matrix<-cbind(x1,x2,x3,x4)
# Matplot
matplot(matrix,type="l",lwd=2,main="Matplot")
legend("topright",legend=paste0("X",1:4),col=1:4,lty=1:4,lwd=3)
Visualizing Gene Expression ProfilesMatplots: comparing multiple variables
X1
X2
X4
X3
Ge
ne
Exp
ressio
n L
eve
l
12/80
# Matrix with 4 correlated variables, 30 observations (or samples)
x1<-rnorm(30)
x2<-x1+rnorm(30,sd=0.1)
x3<-rnorm(30,sd=0.1)
x4<-rnorm(30)-x1
matrix<-cbind(x1,x2,x3,x4)
# Matplot
matplot(matrix,type="l",lwd=2,main="Matplot")
legend("topright",legend=paste0("X",1:4),col=1:4,lty=1:4,lwd=3)
Visualizing Gene Expression ProfilesMatplots: comparing multiple variables
X1
X2
X4
X3
+
Ge
ne
Exp
ressio
n L
eve
l
13/80
# Matrix with 4 correlated variables, 30 observations (or samples)
x1<-rnorm(30)
x2<-x1+rnorm(30,sd=0.1)
x3<-rnorm(30,sd=0.1)
x4<-rnorm(30)-x1
matrix<-cbind(x1,x2,x3,x4)
# Matplot
matplot(matrix,type="l",lwd=2,main="Matplot")
legend("topright",legend=paste0("X",1:4),col=1:4,lty=1:4,lwd=3)
Visualizing Gene Expression ProfilesMatplots: comparing multiple variables
X1
X2
X4
X3
-+
Ge
ne
Exp
ressio
n L
eve
l
14/80
# Matrix with 4 correlated variables, 30 observations (or samples)
x1<-rnorm(30)
x2<-x1+rnorm(30,sd=0.1)
x3<-rnorm(30,sd=0.1)
x4<-rnorm(30)-x1
matrix<-cbind(x1,x2,x3,x4)
# Matplot
matplot(matrix,type="l",lwd=2,main="Matplot")
legend("topright",legend=paste0("X",1:4),col=1:4,lty=1:4,lwd=3)
Visualizing Gene Expression ProfilesMatplots: comparing multiple variables
X1
X2
X4
X3
Ge
ne
Exp
ressio
n L
eve
l
15/80
# Matrix with 4 correlated variables, 30 observations (or samples)
x1<-rnorm(30)
x2<-x1+rnorm(30,sd=0.1)
x3<-rnorm(30,sd=0.1)
x4<-rnorm(30)-x1
matrix<-cbind(x1,x2,x3,x4)
# Matplot
matplot(matrix,type="l",lwd=2,main="Matplot")
legend("topright",legend=paste0("X",1:4),col=1:4,lty=1:4,lwd=3)
Visualizing Gene Expression ProfilesMatplots: comparing multiple variables
X1
X2
X4
X3
Ge
ne
Exp
ressio
n L
eve
l
16/80
# Matrix with 4 correlated variables, 30 observations (or samples)
x1<-rnorm(30)
x2<-x1+rnorm(30,sd=0.1)
x3<-rnorm(30,sd=0.1)
x4<-rnorm(30)-x1
matrix<-cbind(x1,x2,x3,x4)
# Matplot
matplot(matrix,type="l",lwd=2,main="Matplot")
legend("topright",legend=paste0("X",1:4),col=1:4,lty=1:4,lwd=3)
Visualizing Gene Expression ProfilesMatplots: comparing multiple variables
X1
X2
X4
X3
Ge
ne
Exp
ressio
n L
eve
l
17/80
# Matrix with 4 correlated variables, 30 observations (or samples)
x1<-rnorm(30)
x2<-x1+rnorm(30,sd=0.1)
x3<-rnorm(30,sd=0.1)
x4<-rnorm(30)-x1
matrix<-cbind(x1,x2,x3,x4)
# Matplot
matplot(matrix,type="l",lwd=2,main="Matplot")
legend("topright",legend=paste0("X",1:4),col=1:4,lty=1:4,lwd=3)
Visualizing Gene Expression ProfilesMatplots: comparing multiple variables
X1
X2
X4
X3
Ge
ne
Exp
ressio
n L
eve
l
18/80
# Matrix with 4 correlated variables, 30 observations (or samples)
x1<-rnorm(30)
x2<-x1+rnorm(30,sd=0.1)
x3<-rnorm(30,sd=0.1)
x4<-rnorm(30)-x1
matrix<-cbind(x1,x2,x3,x4)
# Matplot
matplot(matrix,type="l",lwd=2,main="Matplot")
legend("topright",legend=paste0("X",1:4),col=1:4,lty=1:4,lwd=3)
Visualizing Gene Expression ProfilesMatplots: comparing multiple variables
X1
X2
X4
X3
Ge
ne
Exp
ressio
n L
eve
l
19/80
# Matrix with 4 correlated variables, 30 observations (or samples)
x1<-rnorm(30)
x2<-x1+rnorm(30,sd=0.1)
x3<-rnorm(30,sd=0.1)
x4<-rnorm(30)-x1
matrix<-cbind(x1,x2,x3,x4)
# Matplot
matplot(matrix,type="l",lwd=2,main="Matplot")
legend("topright",legend=paste0("X",1:4),col=1:4,lty=1:4,lwd=3)
Visualizing Gene Expression ProfilesMatplots: comparing multiple variables
X1
X2
X4
X3
Ge
ne
Exp
ressio
n L
eve
l
20/80
Transcriptional Regulation
DNA
21/80
Transcriptional Regulation
DNAGene 1 Gene 2 Gene 3 Gene 21000
22/80
Transcriptional Regulation
DNAGene 1 Gene 2 Gene 3 Gene 21000
Gene Promoters
23/80
Transcriptional Regulation
DNAGene 1 Gene 2 Gene 3 Gene 21000
TFTranscription
Factor
(Activated)
TF1
24/80
Transcriptional Regulation
DNAGene 1 Gene 2 Gene 3 Gene 21000
TF1 TF2
25/80
Transcriptional Regulation
DNAGene 1 Gene 2 Gene 3 Gene 21000
TF1 TF2
Gene 1 RNA Gene 3 RNA
26/80
Transcriptional Regulation
DNAGene 1 Gene 2 Gene 3 Gene 21000
TF1 TF2
Gene 1 RNA Gene 3 RNA
Gene 1 Protein Gene 3 Protein
27/80
Transcriptional Regulation
DNAGene 1 Gene 2 Gene 3 Gene 21000
TF1 TF2
Gene 1 RNA Gene 3 RNA
Gene 1 Protein Gene 3 ProteinEffect
(e.g. Proliferation)
28/80
Transcriptional Regulation
DNAGene 1 Gene 2 Gene 3 Gene 21000
TF1 TF2
Gene 1 RNA Gene 3 RNA
Gene 1 Protein Gene 3 ProteinEffect
(e.g. Proliferation)
Gene 1 Gene 3TF1 TF2
29/80
Gene Regulatory Networks
Network Representation of
Transcriptional Regulation in Nannochloropsis oceanica
(Hu et al., 2014, PMID 24965723)
Yellow: Transcription Factors (TFs)
Pink: Target Genes (TGs)
30/80
Gene Regulatory Networks
Network Representation of
Transcriptional Regulation in Nannochloropsis oceanica
(Hu et al., 2014, PMID 24965723)
Yellow: Transcription Factors (TFs)
Pink: Target Genes (TGs)
31/80
Gene Regulatory Networks
DataGene Regulatory Network
32/80
Gene Regulatory Networks
Gene Regulatory Network Data
Network Reverse
Engineering
33/80
Gene Regulatory Networks
DataGene Regulatory Network
Network Reverse
Engineering
TF
μRNA
TG
Nodes:
Activation
Edge
Repression
Edge
C E
N N
Undirected Edge
Directed Edge
Node A
Node B
Node C
34/80
Gene Regulatory Networks
DataGene Regulatory Network
Network Reverse
Engineering
TF
μRNA
TG
Nodes:
Activation
Edge
Repression
Edge
C E
N N
Undirected Edge
Directed Edge
Node A
Node B
Node C
35/80
• Coexpression-Based
• Motif-Based
• ChIP-Seq-Based
• Orthology-Based
• Literature-Based
• Integrated
Methods to Reverse-Engineer Gene Regulatory Networks
36/80
Network Reverse Engineering via Orthology
Figure by Laura Scalambra
Inference of Orthology
TF
TF
TF
TF
TF
TG
TF
TG
Species 1
Species 2
Phylogenetic Analysis
sequence conservation
pattern analysis
GRN Edge Prediction
TG TG
T
G
Species 1
Species 2
TF TG
TF
TF TG
TF TG
TG ?
Experimental
ValidationNew
Information
TF Homo sapiens
TF Mus Musculus
TF Bos Taurus
TF Danio Rerio
Orthology Relationship
Regulatory Relationship
(Experimentally Validated)
A B C
TG
? TG ?
Regulatory Relationship
(Hypothesized)
37/80
Network Reverse Engineering via Orthology
Figure by Laura Scalambra
38/80
Network Reverse Engineering via Co-expression
Gene A Gene B
Common causee.g. Cellulose Synthase (CESA) complex
Gene A
Gene B
Gene C
Cause-Effect
Gene C
CESA2/5/6/9
CESA2/5/6/9
CESA1
CESA3
CESA1
CESA1
AP2
e.g. UDP-L-Rhamnose synthesis
GL2 RHM2
TF TF enzyme
39/80
Example of Coexpressed GenesCellulose Synthase Complex (CESAs) genes
Samples
Matplot Scatterplot
40/80
Transcriptome-wide Reverse Engineering
Evaluate all gene pairs associations
Coexpression-basedReverse Engineering
methods
41/80
Coexpression Evaluation Methods
1. Correlation(Pearson, Spearman, ...)
2. Mutual Information3. Linear Regression4. ...
General methods
Gene A
Conditional methods
1. Partial Correlation2. Partial Mutual Information3. LASSO regression4. Data Processing Inequality5. ...
Gene B Gene C
Gene A
Gene B
Gene C
Gene C
Gene A
Gene B
42/80
• Synonyms: PCC, rxy, Product Moment Correlation Coefficient,
"Correlation"
• It calculates the linear dependency between two variables
• It Ranges from -1 to 1
Pearson Correlation Coefficient
1 0 -1-0.70.7
43/80
Pearson Correlation Coefficient: Visual Examples
44/80
Pearson Correlation Coefficient: Visual Examples
45/80
Pearson Correlation Coefficient: the Formula
Using as numeric variables Gene Expression Profiles (GEPs)
Two variables (GEPs for two genes): x and y
Number of samples (observations): n
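$\bar{x}$: The mean of the x gene expression across samples
$\bar{y}$: The mean of the y gene expression across samples
$$r_{xy} = \frac{\sum_{i=1}^{n}(x_i-\bar{x})(y_i-\bar{y})}{\sqrt{\sum_{i=1}^{n}(x_i-\bar{x})^2}\,\sqrt{\sum_{i=1}^{n}(y_i-\bar{y})^2}}$$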
46/80
How to calculate Pearson Correlation in R
► You could of course write a Correlation function, but the one in R is highly optimized
(also for all-vs-all when lots of variables are present, e.g. in Transcriptomics)
► Let's generate two random (but correlated) variables. Last time before real data.
► In this example, x is independent, and y depends on x plus some noise
x<-rnorm(1000)
y<-x+rnorm(1000,sd=1)
plot(x,y,cex=2,pch=20)
cor(x,y) # r=0.72
► The magnitude of the noise determines the PCC (in this case, since the data are
Gaussian-distributed, the SD of the noise)
x<-rnorm(1000)
y<-x+rnorm(1000,sd=0.1)
plot(x,y,pch=20)
cor(x,y) # r=0.99
x<-rnorm(1000)
y<-x+rnorm(1000,sd=2)
plot(x,y,pch=20)
cor(x,y) # r=0.41
47/80
Pearson Correlation Coefficient: Significance
► We Observe a Correlation. Can we say from the r value if it's: high, low or significant?
► Short Answer: NO
► The significance also depends on the number of observations. The more observations, the more robust
the PCC and the higher the significance
► p-value: Chance to obtain the same PCC (or higher) with random data of identical
size (nr. of samples)
► The lower the p-value, the higher the significance
x<-rnorm(1000)
y<-x+rnorm(1000,sd=10)
cor.test(x,y) # r=0.44, p<7.5x10-8
par(mfrow=c(2,3))
for(n in c(10,20,50,100,500,1000)){
x<-rnorm(n)
y<-x+rnorm(n,sd=1)
r<-signif(cor(x,y),4)
p<-signif(cor.test(x,y)$p.value,4)
title<-paste0("n=",n," r=",r," p=",p)
plot(x,y,main=title,pch=20)
}
48/80
The Estrogen Receptor (ER)
ER
• ER Drives Proliferation in
endometrium, breast,
ovarian and
hypothalamus
• Activator of Transcription
(occasionally repressor)
• Driver of
• Recognizes Estrogen
Responsive Elements in
promoters of TGs
49/80
The Tumor Master Regulator Hypothesis
Effector
genes(Whole genome)
(epi)Genom
ic events(Mutations, CNVs,
methylations)
TF TFTF
Master
Regulators(Transcription Factors)
Proliferation Migration
50/80
The Tumor Master Regulator Hypothesis
Effector
genes(Whole genome)
(epi)Genom
ic events(Mutations, CNVs,
methylations)
TF TFTF
Master
Regulators(Transcription Factors)
Proliferation Migration
Glioblastoma:
Carro MS et al. Nature. 2010 Jan 21;463(7279):318-
25.
Chen J et al, Nature. 2014 159(2): 402-14.
Tumor Checkpoint: CEBPb/d and Stat3
Alteration KLHL9
Diffuse Large B Cell Lymphoma:
Compagno M et al. Nature. 2009 Jun
4;459(7247):717-21
Tumor Checkpoint: Nf-kB
Alterations: CARD11, A20, …
GC-Resistance in T-ALL:
Real PJ et al. Cancer Cell. 2013 Dec 9;24(6)
Tumor Checkpoint: NOTCH1/Akt1 pathways
Alterations: Pten, Pi3k
T-ALL Tumorigenesis:
Real PJ et al. Nat Med. 2009 Jan;15(1):50-8.
Dalla Gatta G et al. Nature Medicine, 2012 Feb
26;18(3)
Tumor Checkpoint: TLX1, TLX3, RUNX1
Alterations: TLX1, TLX3, RUNX1
Malignant Prostate Cancer:
Aytes et al. Cancer Cell 25, 638-651 (2014)
Tumor Checkpoint: FOXM1 and CENPF
Follicular Lymphoma progression to DLBCL
• Bisikirska B. et al., Cancer Research in press (2015)
• Tumor Checkpoint: FOXM1, TFDP1, ATF5,
HMGA1, NFYB
TFs Master Regulators of
Luminal Breast Cancer:
ER + GATA3 + FOXA1(Fletcher et al., Nat Commun. 2013)
51/80
Correlation Requires Data
A dataset: the TCGA Breast Cancer RNA-Seq dataset
~1200 patients
~20k genes
Gene
Expression
Values
R object:
• An Expression Matrix• expmat
Fields:
• rownames(expmat): Gene Symbols
• colnames(expmat): TCGA Sample IDs
52/80
Exercises! A simple Pearson Correlation Network1. Get the second biggest Breast Cancer Gene Expression Dataset ever generated (~2mins download):
load(url("https://www.dropbox.com/s/nrxrsq8m0gfjwqh/tcga_BRCA-expmat.rda?dl=1"))
2. Describe it (number of genes, number of samples)
– Always visualize a slice of a big dataset to familiarize with dimension names and data format, e.g. with expmat[1:5,1:5]
– Plot a single gene as a sanity check, e.g. plot(expmat["EGFR",],type="l")
3. Find the 100 genes most correlated (Pearson) with ER: the Estrogen Receptor 1 TF
(use the Genecards website to find the ER gene name). Plot some ER-gene scatterplot. Check the
correlation between ER and some of its known interactors: FOXA1, GATA3.
53/80
Exercises! A simple Pearson Correlation Network4. Compare the top 100 ER-positively correlated TGs in your network with the genes controlled by ER and defined
experimentally by the MsigDB Pathway Database. Is the overlap greater than expected by chance?
– Hint1: click on Show Members
– Hint2: use Notepad++ and read.delim()
– Remember fisher.test(ctable,alternative="greater")ER Targets
Experimentally
Known
(MsigDB)
ER
Positive
Correlators
If you cannot access the website, the
database ER pathway genes are here
54/80
Exercises! A simple Pearson Correlation Network
5. Define a Hypothetical Gene Network Centered around ER
Distinguishing:Positively correlated (TGs activated by ER)
Negatively correlated (TGs repressed by ER)
Bonus points: use the igraph library to draw a network of
positively correlated ER targets with edges thickness proportional
to the correlation coefficient
A common table representation for a network is this:
TF TG Weight
ESR1 AGR2 0.xxx
ESR1 MAPT 0.xyx
ESR1 ARSG 0.yxx
ESR1 AFF3 0.xzx
ESR1 THSD4 0.zxy
Where TF is a Transcription Factor, TG is a Target Gene and
Weight in this case is the strength of the correlation
55/80
Technical Details and Tips1. The dataset contains normalized expression data:
1. Samples collected by The Cancer Genome Atlas initiative and analyzed in this paper
2. RNA-Seq
3. Patient-Derived
4. Breast Cancer (all subtypes)
5. Tumor tissues and Proximal Normal tissues
6. VST-normalization (raw counts are divided by sample size and then the distribution is transformed so that it can be tested
by other tools, like DESeq2): paper here
2. If you want to do a Venn Diagram to show the overlap between coexpression and experimental
validation, try this list of nice R ways to do it
3. There are at least two ways to calculate 1-vs-all PCC in R
1. For loop (remember sort() and names()). Easy but a bit slower
2. Using cor() itself on the matrix (use ?cor and test it on small matrices): this is dangerous and will consume your RAM quickly. A common trick is to reduce the size of the expression matrix, by analyzing only the top1000
genes with the highest variance (tip: you can use a combination var(), apply(), sort() and then names() for this).
56/80
Spearman Correlation
• The Spearman Correlation Coefficient (SCC) calculates dependencies in data that
can be non-linear but monotonic
• Its trick is to rank-transform the initial GEP:
x<-c(0,1,5,30,40,50,100)
xrank<-rank(x)
plot(x,xrank,cex=2,pch=20,type="b")
• When compared to Pearson, Spearman can find
non-linear, exponential relationship:
x<-abs(runif(1000,2,10))
y<-x^10
pcc<-cor(x,y)
scc<-cor(x,y,method="spearman")
plot(x,y,pch=20)
title(paste0("PCC=",signif(pcc,4),"\nSCC=",signif(scc,4)))
57/80
Spearman Correlation
• Another great advantage: Spearman Correlation's Rank Transformation makes it
more robust to outliers than Pearson:
x<-c(rnorm(100),50)
y<-c(rnorm(100),50)
pcc<-cor(x,y)
scc<-cor(x,y,method="spearman")
plot(x,y,pch="☺")
title(paste0("PCC=",signif(pcc,4),"\n
SCC=",signif(scc,4)))
• PCC and SCC are equally accepted in scientific
papers. SCC is more refined, but a bit slower to
calculate, especially for huge analyses (millions of
features and samples)
• There is another, third and last Correlation method: Kendall Correlation (KCC)
KCC is very similar to SCC: it measures monotonic relationships. It measures the
number of co-increasing quadruplets of points, but it is slow to calculate
and nobody uses it in Science
58/80
Exercises! Pearson vs. Spearman Correlation
1. Take two random rows from the Breast Cancer expmat. Compare the output
(numerically and graphically) of cor(x,y,method="spearman") with:
– cor(rank(x),rank(y),method="pearson")
– cor(log10(x),log10(y),method="pearson")
2. Calculate all PCCs and SCCs between ESR1 and other genes
– Correlate the resulting vectors of Correlation Coefficients
– What is the ESR1 edge with the highest difference between PCC and SCC?
Purpose: find out real cases where Spearman is more robust than Pearson. This RNA-Seq dataset is
exactly the data you will get in a real job. It's not artificial. There are no perfect examples.
ESR1 ?
Edge
59/80
• Technique to remove indirect correlations
Partial Correlation
Gene X Gene Y
Gene Z0.81 0.89
0.73
Standard Correlation (zeroth order)
Partial Correlation (first order)
r_xy <- cor(x,y)
60/80
Partial Correlation
Gene X Gene Y
Gene Z0.81 0.89
0.03
Standard Correlation (zeroth order)
Partial Correlation (first order)
r_xy <- cor(x,y)
• Technique to remove indirect correlations
61/80
R example of Partial Correlation
set.seed(1)
x<-rnorm(1000)
y<-rnorm(1000)+x
z<-rnorm(1000)+y
A simple gene circuitX ZY
cor(x,y) # 0.7076544
cor(x,z) # 0.5939462
cor(y,z) # 0.828587
pcor(x,y,z) # 0.4784933
pcor(x,z,y) # 0.01919473
pcor(y,z,x) # 0.7182564
Calculate cor and partial cor (*you will need to write the pcor function!*)
X
Y
Z
62/80
R example of Partial Correlation
set.seed(1)
x<-rnorm(1000)
y<-rnorm(1000)+x
z<-rnorm(1000)+y
A simple gene circuitX ZY
cor(x,y) # 0.7076544
cor(x,z) # 0.5939462
cor(y,z) # 0.828587
pcor(x,y,z) # 0.4784933
# Partial correlation of each remaining pair, controlling for the third variable
pcor(x,z,y) # 0.01919473
pcor(y,z,x) # 0.7182564
Calculate cor and partial cor (*you will need to write the pcor function!*)
X
Y
Z
63/80
Exercises! Partial Correlation
pcor<-function(x,y,z){}
load(url("https://www.dropbox.com/s/nrxrsq8m0gfjwqh/tcga_BRCA-expmat.rda?dl=1"))
1) Load the TCGA Breast Cancer Expression Matrix:
2) Write a function to calculate partial correlation:
Test your function on three genes of your choice.
To test if your result is correct, use this website to calculate pcor from standard PCCs (r=PCC)
3) Using Partial Correlation, remove the indirect edge from these small gene networks:
In blue, Target Genes
In orange: Transcription Factors
GATA3
BCL2
ESR1 GATA3
SCUBE2
RARA E2F3
ODC1
MYC
link
ZEB1
JCAD
SNAI2
64/80
Mutual Information
X
• Mutual Information is a very sophisticated method that is able to find ANY relationship
structure between two variables
• Mutual Information is a method designed for
categorical data. I.e. with finite distinct outcomes
("red", "blue", "green", etc.)
• GEPs are continuous variables with n observations
• The conversion of rank-transformed continuous to
categorical data requires a process called binning:
Y
The number of bins can vary.
In the common algorithm,
the nr. bins = the cube root of the nr. obs (n), rounded.
65/80
Mutual Information: calculation
X
• Once binning is defined, one needs to calculate the entropy (H)
of each variable (X and Y) and the entropy of the joint distribution
(X,Y) with Shannon's formula: $H(X) = -\sum_{i} p(x_i)\,\log p(x_i)$
• Mutual Information between X and Y is defined as such: $MI(X,Y) = H(X) + H(Y) - H(X,Y)$
• In other words: if the disorder of the pair X-Y is lower than the sum of the disorder in the
separated variables, then there is a dependency between X and Y
66/80
Mutual Information in Real Life
"Oh no! I dOn't uNdErStaNd wHy wE aRe stuDyiNg this!
I want to learn useful stuff that I will use in real life!"
67/80
Mutual Information in Real Life
"Oh no! I dOn't uNdErStaNd wHy wE aRe stuDyiNg this!
I want to learn useful stuff that I will use in real life!"
>50,000
citations!!!
68/80
Mutual Information vs. PCC
Pearson correlation
Mu
tua
l in
form
atio
nDevelopmental dataset (Arabidopsis)
Pearson = 0.15
M.I . = 1.04-0.5 0 0.5
0.2
0.4
0.6
0.8
1.0
1.2
69/80
Mutual Information: Practical Solution
70/80
Mutual Information: Practical Solution
71/80
Mutual Information: Practical Solution
72/80
Mutual Information: Practical Solution
73/80
Mutual Information: Practical Solution
74/80
Mutual Information: Practical Solution
75/80
Mutual Information: Practical Solution
76/80
Mutual Information: Practical Solution
77/80
Mutual Information: Practical Solution
78/80
Exercises! Mutual Information
1. (Long): Write a function to calculate Mutual Information between these two variables
(set nr bins = n^(1/3)). Tips: cut() for binning and table() for calculating occurrences.
– A correct function should give maximum self-MI: mi(x,x)=1.94
– The entropy is a sum of terms of the form p log(p). When p=0 you instead use the limiting value
(as p approaches 0 from above), which is 0
n<-300
set.seed(1)
x<-runif(n,-4,4)
y<-abs(rnorm(n)+x^3)
# Expected results:
cor(x,y,method="p") # 0.07353703
cor(x,y,method="s") # -0.01902377
mi(x,y) # 0.7567252
79/80
Exercises! Mutual Information2. (Long): repeat my Arabidopsis analysis comparing PCC vs Mutual Info in the Breast cancer dataset.
Below is what it will look like for ESR1 edges only. However, we are looking for something more
diverging, if possible. Try as many genes as you can (the challenge is to test every possible gene-
gene pair without running out of RAM). An edge that is non-linear, non-monotonic, that can
underlie a non-canonical regulatory mechanism.
All ESR1 correlations (except ESR1 vs ESR1)
→ Every point in this plot is a gene pair!
80/80
Exercises! Mutual Information2. (Long): repeat my Arabidopsis analysis comparing PCC vs Mutual Info in the Breast cancer dataset.
Below is what it will look like for ESR1 edges only. However, we are looking for something more
diverging, if possible. Try as many genes as you can (the challenge is to test every possible gene-
gene pair without running out of RAM). An edge that is non-linear, non-monotonic, that can
underlie a non-canonical regulatory mechanism.
?
All ESR1 correlations (except ESR1 vs ESR1)
→ Every point in this plot is a gene pair!
www.giorgilab.org
Federico M. Giorgi, PhD
Department of Pharmacy and Biotechnology
to EF, bringer of salted breakfasts
82/80
Tricks (Pearson Correlation Exercise)
##################
# Load the TCGA breast cancer expression matrix. This creates `expmat`, with
# gene symbols as row names and TCGA sample IDs as column names.
# The URL must stay on one line: a line break inside the string literal would
# become part of the URL and break the download.
load(url("https://www.dropbox.com/s/nrxrsq8m0gfjwqh/tcga_BRCA-expmat.rda?dl=1"))
### Play
# Sanity check: one gene across all samples ("l" is the line plot type)
plot(expmat["EGFR",],type="l")
# Gene Expression Profile (GEP)
mygep<-expmat["ESR1",]
# ## Correlation, matrix way
# # Keep only the top variance genes
# vars<-apply(expmat,1,var)
# vars<-sort(vars,decreasing=TRUE)
# topvars<-names(vars)[1:1000]
# # Correlation
# submat<-expmat[topvars,]
# cormat<-cor(t(submat))
# cors<-cormat["ESR1",]
# cor_genes<-names(sort(cors,decreasing=TRUE))[1:30]
## Correlation, 1-vs-all
# One Pearson coefficient per row (gene) of expmat against the ESR1 profile.
# vapply() fills a preallocated numeric vector instead of growing one with c()
# on every iteration, and seq_len() is safe if expmat has zero rows.
cors<-vapply(seq_len(nrow(expmat)),function(i){
  cor(mygep,expmat[i,])
},numeric(1))
names(cors)<-rownames(expmat)
# Drop the trivial ESR1-vs-ESR1 self-correlation (always 1) so that it does not
# occupy one of the top-100 slots or appear as a self-edge in the network
cors<-cors[names(cors)!="ESR1"]
# The 100 genes most positively correlated with ESR1
cor_genes<-names(sort(cors,decreasing=TRUE))[1:100]
# Plot them
# Scatterplot of ESR1 against its strongest correlator
othergep<-cor_genes[1]
plot(mygep,expmat[othergep,],xlab="ESR1",ylab=othergep)
# Draw the network
library(igraph)
# Edge table in the TF / TG / Weight layout: one row per ESR1 -> target edge
TF<-rep("ESR1",length(cor_genes))
TG<-cor_genes
Weight<-unname(cors[cor_genes])
# Build the table with data.frame(), not cbind(): cbind() returns a character
# matrix, which turns Weight into text and breaks the edge-width arithmetic below
mydf<-data.frame(TF=TF,TG=TG,Weight=Weight,stringsAsFactors=FALSE)
mydf[,3]
g <- graph_from_data_frame(mydf, directed=TRUE)
g <- set_edge_attr(g, "weight", value = mydf$Weight)
# ?plot.igraph
# Make sure the output directory exists before opening the PNG device
dir.create("plots",showWarnings=FALSE)
png("plots/igraph.png",width=3000,height=3000,res=300)
# Edge thickness is proportional to the correlation; labels are rounded for legibility
plot(g, edge.width = E(g)$weight/5, edge.label=round(E(g)$weight,3)) # with 30 it's nicer
dev.off()
### Load the motif-based target list from MSigDB
# http://software.broadinstitute.org/gsea/msigdb/cards/HALLMARK_ESTROGEN_RESPONSE_LATE
raw<-read.delim("data/06_networks/ertargets.txt",as.is=TRUE)
db_genes<-raw[,1]
# Keep only database genes that are present in the expression matrix: the
# "universe" of the test below is rownames(expmat), so genes outside it must
# not be counted in the contingency table
db_genes<-intersect(db_genes,rownames(expmat))
## Enrichment. Expected test
# Genes both in the database and among the top ESR1 correlators
overlap<-intersect(db_genes,cor_genes)
# Percentages below are the values reported on the slides (approximate)
100*length(overlap)/length(db_genes) # 6.5% of db_genes in our prediction
100*length(overlap)/length(cor_genes) # 13% of cor_genes in database
100*length(cor_genes)/nrow(expmat) # 0.5% of cor_genes
## Contingency table: upper-left, upper-right, down-left, down-right
ul<-length(overlap) # in database AND in prediction
ur<-length(setdiff(db_genes,overlap)) # in database only
dl<-length(setdiff(cor_genes,overlap)) # in prediction only
dr<-nrow(expmat)-ul-ur-dl # the "universe"
ctable<-rbind(c(ul,ur),c(dl,dr))
# One-sided test: is the overlap larger than expected by chance?
fisher.test(ctable,alternative="greater")
83/80
Tricks (Pearson vs Spearman)
############# PCC vs SCC
mygene<-"ESR1"
mygep<-expmat[mygene,]
# cor() correlates columns while genes are the rows of expmat, so transpose.
# Do it once: each t(expmat) call copies the whole matrix.
texpmat<-t(expmat)
# cor(vector,matrix) returns a 1 x genes matrix; drop() flattens it to a vector
pccs<-drop(cor(mygep,texpmat,method="pearson"))
sccs<-drop(cor(mygep,texpmat,method="spearman"))
names(pccs)<-rownames(expmat)
names(sccs)<-rownames(expmat)
plot(pccs,sccs)
# ESR1-other with the highest PCC vs. SCC difference
diffs<-abs(pccs-sccs)
diffs<-sort(diffs,decreasing=TRUE)[1:10]
# Values reported on the slides:
# SCG3 FBXL13 CHGB TRH TLX1NB ANK1 EREG CDKN2A FUT6 BTN1A1
# 0.2981869 0.2721195 0.2404624 0.2369656 0.2262180 0.2175203 0.2165480 0.2149584 0.2103572 0.2095228
# Scatterplot of the most divergent pair (SCG3 on the slides)
topgene<-names(diffs)[1]
x<-expmat[mygene,]
y<-expmat[topgene,]
pcc<-cor(x,y)
scc<-cor(x,y,method="spearman")
plot(x,y,pch=20,xlab=mygene,ylab=topgene)
title(paste0("PCC=",signif(pcc,4),"\nSCC=",signif(scc,4)))
84/80
############################## Partial Correlation
load(url("https://www.dropbox.com/s/nrxrsq8m0gfjwqh/tcga_BRCA-expmat.rda?dl=1"))
pcor<-function(x,y,z){
  # First-order partial correlation between x and y, controlling for z.
  # x, y, z: numeric vectors of equal length.
  # Returns one number, computed from the three pairwise Pearson correlations:
  #   (r_xy - r_xz*r_yz) / sqrt((1 - r_xz^2) * (1 - r_yz^2))
  r_xy<-cor(x,y)
  r_xz<-cor(x,z)
  r_yz<-cor(y,z)
  numerator<-r_xy-r_xz*r_yz
  denominator<-sqrt((1-r_xz^2)*(1-r_yz^2))
  return(numerator/denominator)
}
# Expression profiles of the three genes of the SNAI2 / ZEB1 / JCAD mini-network
x<-expmat["SNAI2",]
y<-expmat["ZEB1",]
z<-expmat["JCAD",]
# Standard (zeroth-order) pairwise Pearson correlations
cor(x,y)
cor(x,z)
cor(y,z)
# First-order partial correlations: each pair, controlling for the third gene
pcor(x,y,z)
pcor(x,z,y)
pcor(y,z,x)
Tricks (Partial Correlation)
85/80
####################################################
################### MI
# Mutual Information between two numeric vectors, estimated by binning.
#   x, y    numeric vectors of equal length (paired observations)
#   nrbins  number of equal-width bins per variable; by default the rounded
#           cube root of the number of observations
# Returns MI(x,y) = H(x) + H(y) - H(x,y), in nats (natural logarithm).
# The sample size and the bin count are derived from the input itself, so the
# function gives correct frequencies for vectors of any length and does not
# depend on global variables.
mi<-function(x,y,nrbins=NULL){
  stopifnot(length(x)==length(y))
  # Use only observations where both values are present
  keep<-!(is.na(x)|is.na(y))
  x<-x[keep]
  y<-y[keep]
  nobs<-length(x)
  if(is.null(nrbins)){
    # cut() needs at least 2 intervals
    nrbins<-max(2,round(nobs^(1/3)))
  }
  # Shannon entropy from a table of counts; empty bins contribute 0
  # (the limit of p*log(p) as p approaches 0 from above)
  entropy<-function(counts){
    p<-as.vector(counts)/nobs
    p<-p[p>0]
    -sum(p*log(p))
  }
  # Binning
  xbinned<-cut(x,nrbins)
  ybinned<-cut(y,nrbins)
  # Entropy of individual variables
  hx<-entropy(table(xbinned))
  hy<-entropy(table(ybinned))
  # Entropy of joint distribution XY
  hxy<-entropy(table(xbinned,ybinned))
  # Mutual Information
  mi_xy<-hx+hy-hxy
  return(mi_xy)
}
# Worked example: a non-linear, non-monotonic dependency
n<-300
set.seed(1)
x<-runif(n,-4,4)
y<-abs(rnorm(n)+x^3)
plot(x,y,pch=20)
nrbins<-round(n^(1/3))
cor(x,y) # 0.07353703
cor(x,y,method="spearman") # -0.01902377
mi(x,y) # 0.7567252
Tricks (Mutual Information Function)
86/80
########## PCC vs. Mutual info
mygep<-expmat["ESR1",]
# Row indices of expmat; seq_len() is safe even for a zero-row matrix
gene_idx<-seq_len(nrow(expmat))
## Correlation, one value per gene
# vapply() fills a preallocated numeric vector instead of growing one with c()
pccs<-vapply(gene_idx,function(i){
  cor(mygep,expmat[i,])
},numeric(1))
names(pccs)<-rownames(expmat)
## MI, one value per gene
mis<-vapply(gene_idx,function(i){
  mi(mygep,expmat[i,])
},numeric(1))
names(mis)<-rownames(expmat)
## Compare MI vs PCC
# Remove the trivial ESR1-vs-ESR1 pair
pccs<-pccs[names(pccs)!="ESR1"]
mis<-mis[names(mis)!="ESR1"]
plot(pccs,mis,xlab="Pearson Correlation",ylab="Mutual Information")
# Select genes with low PCC
candidates1<-names(pccs[abs(pccs)<0.1])
# Select genes with high MI
candidates2<-names(mis[abs(mis)>0.18])
# Intersection
candidates<-intersect(candidates1,candidates2)
candidates # "NDN"
# Inspect the candidate edge: low linear correlation but high mutual information
x<-expmat["ESR1",]
y<-expmat["NDN",]
pcc<-cor(x,y)
scc<-cor(x,y,method="spearman")
mic<-mi(x,y)
plot(x,y,pch=20,xlab="ESR1",ylab="NDN")
title(paste0("PCC=",signif(pcc,4),"\nSCC=",signif(scc,4),"\nMIC=",signif(mic,4)))
Tricks (Mutual Information vs PCC)
87/80
88/80
Exercises!
• Do something
89/80
Solutions
# Some solution
s<-5