BIOINFORMATICS LAB Episode VI Gene Networks with R · BIOINFORMATICS LAB Episode VI ... X3 Level....

BIOINFORMATICS LAB

Episode VI – Gene Networks

with R

Federico M. Giorgi, PhDChiara Cabrelle, TA

Department of Pharmacy and Biotechnology

First Cycle Degree in Genomics

2/80

R Intro Recap (1)

# Vector

length(vector)

names(vector)

# Matrix

nrow(matrix)

ncol(matrix)

dim(matrix)

rownames(matrix)

colnames(matrix)

# Concatenation

longer_vector<-c(vector,vector) # Join two vectors end to end

matrix<-cbind(vector1,vector2) # Column Bind

large_matrix<-rbind(matrix1,matrix2) # Row Bind

large_matrix<-cbind(matrix1,matrix2) # Column Bind

Object types

3/80

R Intro Recap (1)

# Vector

length(vector)

names(vector)

# Matrix

nrow(matrix)

ncol(matrix)

dim(matrix)

rownames(matrix)

colnames(matrix)

# Concatenation

longer_vector<-c(vector,vector) # Join two vectors end to end

matrix<-cbind(vector1,vector2) # Column Bind

large_matrix<-rbind(matrix1,matrix2) # Row Bind

large_matrix<-cbind(matrix1,matrix2) # Column Bind

Object types

4/80

R Intro Recap (2)

# Structure

function_name<-function(param1,param2,param3){

result<-operation(param1,param2)

result<-result+param3

return(result)

}

Custom Functions

# Install Libraries (only once)
# Bioconductor packages are installed through the BiocManager package;
# the older biocLite.R script has been retired and fails on current R.

if(!requireNamespace("BiocManager",quietly=TRUE)){
  install.packages("BiocManager")
}

BiocManager::install("enrichplot")

# Load Libraries (Every time you restart R)

library("enrichplot")

# Help

?fisher.test # Guide for a function

??fish # Fuzzy search

browseVignettes("enrichplot") # Full manual (for good libraries)

Convenience Commands

5/80

R Intro Recap (3)

# While

i<-0

while(i<10){

i<-i+1

message(i)

}

# For

for(i in 1:10){

message(i)

}

# The apply trick

X<-matrix(rnorm(5*4),nrow=5,ncol=4)

apply(X,1,sum)

apply(X,2,sum)

# Nested For Loop

for(i in 1:nrow(X)){

for(j in 1:ncol(X)){

message(i," and ",j)

}

}

Loops

6/80

R Intro Recap (4)

# Read Text files

input<-read.csv("file.csv",as.is=TRUE)

input<-read.delim("file.txt",as.is=TRUE,sep="\t")

# Write Text files

write.csv(object,file="output.csv")

write.table(object,file="output.txt")

# Import Rdata files

load("input.rda")

# Save Rdata files
# The destination must be passed as file=...: an unnamed string is read as
# the name of one more object to save, and save() then stops because no
# file was given.

save(object1,object2,file="input.rda")

Reading/Writing from files

7/80

# Scatterplot or XY plot

x<-rnorm(100)

y<-rnorm(100,mean=1)+x

plot(x,y) # Simple one

# Plot Beautification

plot(x,y,pch=19,col="navy",main="Scatterplot",xlab="Independent Variable",ylab="Dependent Variable")

grid()

mtext("Subtitle")

text(1,-3,"A word")

lm1<-lm(y~x) # A regression line

abline(lm1,lwd=3,col="red")

R Intro Recap (5)Scatterplots: Relationships between Two Variables

8/80

# Boxplot

boxplot(x,y,2*y,col=c("green","orange","red"),main="Boxplot",names=c("X","Y","2Y"))

R Intro Recap (6)Boxplots: comparing multiple (1-10) variables

Median

9/80

# Boxplot

boxplot(x,y,2*y,col=c("green","orange","red"),main="Boxplot",names=c("X","Y","2Y"))

R Intro Recap (6)Boxplots: comparing multiple (1-10) variables

Default R Boxplots Features

Median

1st Quartile

3rd Quartile

InterQuartile Range (IQR)

Upper Whisker (falls on the closest point at 3rdQ + 1.5IQR)

Lower Whisker (falls on the closest point at 1stQ - 1.5IQR)

Outliers

10/80

# Matrix with 4 correlated variables, 30 observations (or samples)

x1<-rnorm(30)

x2<-x1+rnorm(30,sd=0.1)

x3<-rnorm(30,sd=0.1)

x4<-rnorm(30)-x1

matrix<-cbind(x1,x2,x3,x4)

# Matplot

matplot(matrix,type="l",lwd=2,main="Matplot")

legend("topright",legend=paste0("X",1:4),col=1:4,lty=1:4,lwd=3)

Visualizing Gene Expression Profiles

Matplots: comparing multiple variables

Gene Expression Level

11/80


x1<-rnorm(30)

x2<-x1+rnorm(30,sd=0.1)


x4<-rnorm(30)-x1


# Matplot




X1

X2

X4

X3

Gene Expression Level

12/80


x1<-rnorm(30)

x2<-x1+rnorm(30,sd=0.1)


x4<-rnorm(30)-x1


# Matplot




X1

X2

X4

X3

+

Gene Expression Level

13/80


x1<-rnorm(30)

x2<-x1+rnorm(30,sd=0.1)


x4<-rnorm(30)-x1


# Matplot




X1

X2

X4

X3

-+

Gene Expression Level

14/80


x1<-rnorm(30)

x2<-x1+rnorm(30,sd=0.1)


x4<-rnorm(30)-x1


# Matplot




X1

X2

X4

X3

Gene Expression Level

15/80


x1<-rnorm(30)

x2<-x1+rnorm(30,sd=0.1)


x4<-rnorm(30)-x1


# Matplot




X1

X2

X4

X3

Gene Expression Level

16/80


x1<-rnorm(30)

x2<-x1+rnorm(30,sd=0.1)


x4<-rnorm(30)-x1


# Matplot




X1

X2

X4

X3

Gene Expression Level

17/80


x1<-rnorm(30)

x2<-x1+rnorm(30,sd=0.1)


x4<-rnorm(30)-x1


# Matplot




X1

X2

X4

X3

Gene Expression Level

18/80


x1<-rnorm(30)

x2<-x1+rnorm(30,sd=0.1)


x4<-rnorm(30)-x1


# Matplot




X1

X2

X4

X3

Gene Expression Level

19/80


x1<-rnorm(30)

x2<-x1+rnorm(30,sd=0.1)


x4<-rnorm(30)-x1


# Matplot




X1

X2

X4

X3

Gene Expression Level

20/80

Transcriptional Regulation

DNA

21/80


DNAGene 1 Gene 2 Gene 3 Gene 21000

22/80



Gene Promoters

23/80



TFTranscription

Factor

(Activated)

TF1

24/80



TF1 TF2

25/80



TF1 TF2

Gene 1 RNA Gene 3 RNA

26/80



TF1 TF2


Gene 1 Protein Gene 3 Protein

27/80



TF1 TF2


Gene 1 Protein Gene 3 ProteinEffect

(e.g. Proliferation)

28/80



TF1 TF2


Gene 1 Protein Gene 3 ProteinEffect

(e.g. Proliferation)

Gene 1 Gene 3TF1 TF2

29/80

Gene Regulatory Networks

Network Representation of

Transcriptional Regulation in Nannochloropsis oceanica

(Hu et al., 2014, PMID 24965723)

Yellow: Transcription Factors (TFs)

Pink: Target Genes (TGs)

30/80


Network Representation of

Transcriptional Regulation in Nannochloropsis oceanica

(Hu et al., 2014, PMID 24965723)

Yellow: Transcription Factors (TFs)

Pink: Target Genes (TGs)

31/80


DataGene Regulatory Network

32/80


Gene Regulatory Network Data

Network Reverse

Engineering

33/80



Network Reverse

Engineering

TF

μRNA

TG

Nodes:

Activation

Edge

Repression

Edge

C E

N N

Undirected Edge

Directed Edge

Node A

Node B

Node C

34/80



Network Reverse

Engineering

TF

μRNA

TG

Nodes:

Activation

Edge

Repression

Edge

C E

N N

Undirected Edge

Directed Edge

Node A

Node B

Node C

35/80

• Coexpression-Based

• Motif-Based

• ChIP-Seq-Based

• Orthology-Based

• Literature-Based

• Integrated

Methods to Reverse-Engineer Gene Regulatory Networks

36/80

Network Reverse Engineering via Orthology

Figure by Laura Scalambra

Inference of Orthology

TF

TF

TF

TF

TF

TG

TF

TG

Species 1

Species 2

Phylogenetic Analysis

sequence conservation

pattern analysis

GRN Edge Prediction

TG TG

T

G

Species 1

Species 2

TF TG

TF

TF TG

TF TG

TG ?

Experimental

ValidationNew

Information

TF Homo sapiens

TF Mus Musculus

TF Bos Taurus

TF Danio Rerio

Orthology Relationship

Regulatory Relationship

(Experimentally Validated)

A B C

TG

? TG ?

Regulatory Relationship

(Hypothesized)

37/80

Network Reverse Engineering via Orthology

Figure by Laura Scalambra

38/80

Network Reverse Engineering via Co-expression

Gene A Gene B

Common cause

e.g. Cellulose Synthase (CESA) complex

Gene A

Gene B

Gene C

Cause-Effect

Gene C

CESA2/5/6/9

CESA2/5/6/9

CESA1

CESA3

CESA1

CESA1

AP2

e.g. UDP-L-Rhamnose synthesis

GL2 RHM2

TF TF enzyme

39/80

Example of Coexpressed GenesCellulose Synthase Complex (CESAs) genes

Samples

Matplot Scatterplot

40/80

Transcriptome-wide Reverse Engineering

Evaluate all gene pairs associations

Coexpression-basedReverse Engineering

methods

41/80

Coexpression Evaluation Methods

1. Correlation(Pearson, Spearman, ...)

2. Mutual Information3. Linear Regression4. ...

General methods

Gene A

Conditional methods

1. Partial Correlation2. Partial Mutual Information3. LASSO regression4. Data Processing Inequality5. ...

Gene B Gene C

Gene A

Gene B

Gene C

Gene C

Gene A

Gene B

42/80

• Synonyms: PCC, r_xy, Product Moment Correlation Coefficient,

"Correlation"

• It calculates the linear dependency between two variables

• It Ranges from -1 to 1

Pearson Correlation Coefficient

1   0.7   0   -0.7   -1

43/80

Pearson Correlation Coefficient: Visual Examples

44/80

Pearson Correlation Coefficient: Visual Examples

45/80

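Pearson Correlation Coefficient: the Formula

Using as numeric variables Gene Expression Profiles (GEPs)

$$r_{xy}=\frac{\sum_{i=1}^{n}(x_i-\bar{x})(y_i-\bar{y})}{\sqrt{\sum_{i=1}^{n}(x_i-\bar{x})^2}\,\sqrt{\sum_{i=1}^{n}(y_i-\bar{y})^2}}$$

Two variables (GEPs for two genes): $x$ and $y$

Number of samples (observations): $n$

$\bar{x}$: the mean of the x gene expression across samples

$\bar{y}$: the mean of the y gene expression across samples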

46/80

How to calculate Pearson Correlation in R

► You could of course write a Correlation function, but the one in R is highly optimized

(also for all-vs-all when lots of variables are present, e.g. in Transcriptomics)

► Let's generate two random (but correlated) variables. Last time before real data.

► In this example, x is independent, and y depends on x plus some noise

x<-rnorm(1000)

y<-x+rnorm(1000,sd=1)

plot(x,y,cex=2,pch=20)

cor(x,y) # r=0.72

► The entity of the noise determines the PCC (in this case, being gaussianly-distributed

data, the SD)

x<-rnorm(1000)

y<-x+rnorm(1000,sd=0.1)

plot(x,y,pch=20)

cor(x,y) # r=0.99

x<-rnorm(1000)

# Stronger noise lowers the correlation: with sd=2 the expected r is
# 1/sqrt(1+2^2), about 0.45.
# NOTE(review): the sd value is inferred from the reported r; confirm
# against the original slide.
y<-x+rnorm(1000,sd=2)

plot(x,y,pch=20)

cor(x,y) # r=0.41

47/80

Pearson Correlation Coefficient: Significance

► We Observe a Correlation. Can we say from the r value if it's: high, low or significant?

► Short Answer: NO

► The significance also depends on the number of observations. The more observations, the more robust

the PCC and the higher the significance

► p-value: Chance to obtain the same PCC (or higher) with random data of identical

size (nr. of samples)

► The lower the p-value, the higher the significance

x<-rnorm(1000)

# y depends on x plus gaussian noise.
# NOTE(review): the sd value is inferred from the reported r; confirm
# against the original slide.
y<-x+rnorm(1000,sd=2)

# cor.test() reports the coefficient together with its p-value
cor.test(x,y) # r=0.44, p<7.5x10-8

par(mfrow=c(2,3))

for(n in c(10,20,50,100,500,1000)){

x<-rnorm(n)

y<-x+rnorm(n,sd=1)

r<-signif(cor(x,y),4)

p<-signif(cor.test(x,y)$p.value,4)

title<-paste0("n=",n," r=",r," p=",p)

plot(x,y,main=title,pch=20)

}

48/80

The Estrogen Receptor (ER)

ER

• ER Drives Proliferation in

endometrium, breast,

ovarian and

hypothalamus

• Activator of Transcription

(occasionally repressor)

• Driver of

• Recognizes Estrogen

Responsive Elements in

promoters of TGs

49/80

The Tumor Master Regulator Hypothesis

Effector

genes(Whole genome)

(epi)Genomic events (Mutations, CNVs, methylations)

TF TFTF

Master

Regulators(Transcription Factors)

Proliferation Migration

50/80

The Tumor Master Regulator Hypothesis

Effector

genes(Whole genome)

(epi)Genomic events (Mutations, CNVs, methylations)

TF TFTF

Master

Regulators(Transcription Factors)

Proliferation Migration

Glioblastoma:

Carro MS et al. Nature. 2010 Jan 21;463(7279):318-

25.

Chen J et al, Cell. 2014 159(2): 402-14.

Tumor Checkpoint: CEBPb/d and Stat3

Alteration KLHL9

Diffuse Large B Cell Lymphoma:

Compagno M et al. Nature. 2009 Jun

4;459(7247):717-21

Tumor Checkpoint: Nf-kB

Alterations: CARD11, A20, …

GC-Resistance in T-ALL:

Real PJ et al. Cancer Cell. 2013 Dec 9;24(6)

Tumor Checkpoint: NOTCH1/Akt1 pathways

Alterations: Pten, Pi3k

T-ALL Tumorigenesis:

Real PJ et al. Nat Med. 2009 Jan;15(1):50-8.

Dalla Gatta G et al. Nature Medicine, 2012 Feb

26;18(3)

Tumor Checkpoint: TLX1, TLX3, RUNX1

Alterations: TLX1, TLX3, RUNX1

Malignant Prostate Cancer:

Aytes et al. Cancer Cell 25, 638-651 (2014)

Tumor Checkpoint: FOXM1 and CENPF

Follicular Lymphoma progression to DLBCL

• Bisikirska B. et al., Cancer Research in press (2015)

• Tumor Checkpoint: FOXM1, TFDP1, ATF5,

HMGA1, NFYB

TFs Master Regulators of

Luminal Breast Cancer:

ER + GATA3 + FOXA1(Fletcher et al., Nat Commun. 2013)

51/80

Correlation Requires Data

A dataset: the TCGA Breast Cancer RNA-Seq dataset

~1200 patients

~20k genes

Gene

Expression

Values

R object:

• An Expression Matrix• expmat

Fields:

• rownames(expmat): Gene Symbols

• colnames(expmat): TCGA Sample IDs

52/80

Exercises! A simple Pearson Correlation Network

1. Get the second biggest Breast Cancer Gene Expression Dataset ever generated (~2mins download):

load(url("https://www.dropbox.com/s/nrxrsq8m0gfjwqh/tcga_BRCA-expmat.rda?dl=1"))

2. Describe it (number of genes, number of samples)

– Always visualize a slice of a big dataset to familiarize with dimension names and data format, e.g. with expmat[1:5,1:5]

– Plot a single gene as a sanity check, e.g. plot(expmat["EGFR",],type="l")

3. Find the 100 genes most correlated (Pearson) with ER: the Estrogen Receptor 1 TF

(use the Genecards website to find the ER gene name). Plot some ER-gene scatterplot. Check the

correlation between ER and some of its known interactors: FOXA1, GATA3.

53/80

Exercises! A simple Pearson Correlation Network

4. Compare the top 100 ER-positively correlated TGs in your network with the genes controlled by ER and defined

experimentally by the MsigDB Pathway Database. Is the overlap greater than expected by chance?

– Hint1: click on Show Members

– Hint2: use Notepad++ and read.delim()

– Remember fisher.test(ctable,alternative="greater")

ER Targets

Experimentally

Known

(MsigDB)

ER

Positive

Correlators

If you cannot access the website, the

database ER pathway genes are here

http://software.broadinstitute.org/gsea/msigdb/cards/HALLMARK_ESTROGEN_RESPONSE_LATE

https://www.dropbox.com/s/a9afwfaqsxos2at/ertargets.txt?dl=0

54/80

Exercises! A simple Pearson Correlation Network

5. Define a Hypothetical Gene Network Centered around ER

Distinguishing:Positively correlated (TGs activated by ER)

Negatively correlated (TGs repressed by ER)

Bonus points: use the igraph library to draw a network of

positively correlated ER targets with edges thickness proportional

to the correlation coefficient

A common table representation for a network is this:

TF TG Weight

ESR1 AGR2 0.xxx

ESR1 MAPT 0.xyx

ESR1 ARSG 0.yxx

ESR1 AFF3 0.xzx

ESR1 THSD4 0.zxy

Where TF is a Transcription Factor, TG is a Target Gene and

Weight in this case is the strength of the correlation

https://stackoverflow.com/questions/43654779/plot-a-simple-network-graph-in-r

55/80

Technical Details and Tips

1. The dataset contains normalized expression data:

1. Samples collected by The Cancer Genome Atlas initiative and analyzed in this paper

2. RNA-Seq

3. Patient-Derived

4. Breast Cancer (all subtypes)

5. Tumor tissues and Proximal Normal tissues

6. VST-normalization (raw counts are divided by sample size and then the distribution is transformed to be tested

by other tools, like DESeq2): paper here

2. If you want to do a Venn Diagram to show the overlap between coexpression and experimental

validation, try this list of nice R ways to do it

3. There are at least two ways to calculate 1-vs-all PCC in R

1. For loop (remember sort() and names()). Easy but a bit slower

2. Using cor() itself on the matrix (use ?cor and test it on small matrices): this is dangerous and will consume your RAM quickly. A common trick is to reduce the size of the expression matrix, by analyzing only the top1000

genes with the highest variance (tip: you can use a combination var(), apply(), sort() and then names() for this).

https://www.nature.com/articles/nature11412

https://genomebiology.biomedcentral.com/articles/10.1186/gb-2010-11-10-r106

https://stackoverflow.com/questions/8713994/venn-diagram-proportional-and-color-shading-with-semi-transparency

56/80

Spearman Correlation

• The Spearman Correlation Coefficient (SCC) calculates dependencies in data that

can be non-linear but monotonic

• Its trick is to rank-transform the initial GEP:

x<-c(0,1,5,30,40,50,100)

xrank<-rank(x)

plot(x,xrank,cex=2,pch=20,type="b")

• When compared to Pearson, Spearman can find

non-linear, exponential relationship:

x<-abs(runif(1000,2,10))

y<-x^10

pcc<-cor(x,y)

scc<-cor(x,y,method="spearman")

plot(x,y,pch=20)

title(paste0("PCC=",signif(pcc,4),"\nSCC=",signif(scc,4)))

57/80

Spearman Correlation

• Another great advantage: Spearman Correlation's Rank Transformation makes it

more robust to outliers than Pearson:

# Two unrelated variables that share a single extreme outlier at (50,50)
x<-c(rnorm(100),50)

y<-c(rnorm(100),50)

# Pearson is dominated by the outlier...
pcc<-cor(x,y)

# ...while the rank-based Spearman coefficient is barely affected.
# It has to be computed here; otherwise the title would show a stale scc
# left over from an earlier example.
scc<-cor(x,y,method="spearman")

plot(x,y,pch="☺")

title(paste0("PCC=",signif(pcc,4),"\nSCC=",signif(scc,4)))

• PCC and SCC are equally accepted in scientific

papers. SCC is more refined, but a bit slower to

calculate, especially for huge analyses (millions of

features and samples)

• There is another, third and last Correlation method: Kendall Correlation (KCC)

KCC is very similar to SCC: it measures monotonic relationships. It measures the

number of co-increasing quadruplets of points, but it is slow to calculate

and nobody uses it in Science

58/80

Exercises! Pearson vs. Spearman Correlation

1. Take two random rows from the Breast Cancer expmat. Compare the output

(numerically and graphically) of cor(x,y,method="spearman") with:

– cor(rank(x),rank(y),method="pearson")

– cor(log10(x),log10(y),method="pearson")

2. Calculate all PCCs and SCCs between ESR1 and other genes

– Correlate the resulting vectors of Correlation Coefficients

– What is the ESR1 edge with the highest difference between PCC and SCC?

Purpose: find out real cases where Spearman is more robust than Pearson. This RNA-Seq dataset is

exactly the data you will get in a real job. It's not artificial. There are no perfect examples.

ESR1 ?

Edge

59/80

• Technique to remove indirect correlations

Partial Correlation

Gene X Gene Y

Gene Z0.81 0.89

0.73

Standard Correlation (zeroth order)

Partial Correlation (first order)

r_xy <- cor(x,y)

60/80

Partial Correlation

Gene X Gene Y

Gene Z0.81 0.89

0.03

Standard Correlation (zeroth order)

Partial Correlation (first order)

r_xy <- cor(x,y)

• Technique to remove indirect correlations

61/80

R example of Partial Correlation

set.seed(1)

x<-rnorm(1000)

y<-rnorm(1000)+x

z<-rnorm(1000)+y

A simple gene circuit: X → Y → Z

cor(x,y) # 0.7076544

cor(x,z) # 0.5939462

cor(y,z) # 0.828587

pcor(x,y,z) # 0.4784933

pcor(x,z,y) # 0.01919473

pcor(y,z,x) # 0.7182564

Calculate cor and partial cor (*you will need to write the pcor function!*)

X

Y

Z

62/80

R example of Partial Correlation

set.seed(1)

x<-rnorm(1000)

y<-rnorm(1000)+x

z<-rnorm(1000)+y

A simple gene circuit: X → Y → Z

# Standard (zeroth order) correlations
cor(x,y) # 0.7076544

cor(x,z) # 0.5939462

cor(y,z) # 0.828587

# Partial (first order) correlations: each pair, given the third variable
pcor(x,y,z) # 0.4784933

pcor(x,z,y) # 0.01919473

pcor(y,z,x) # 0.7182564

Calculate cor and partial cor (*you will need to write the pcor function!*)

X

Y

Z

63/80

Exercises! Partial Correlation

pcor<-function(x,y,z){}


1) Load the TCGA Breast Cancer Expression Matrix:

2) Write a function to calculate partial correlation:

Test your function on three genes of your choice.

To test if your result is correct, use this website to calculate pcor from standard PCCs (r=PCC)

3) Using Partial Correlation, remove the indirect edge from these small gene networks:

In blue, Target Genes

In orange: Transcription Factors

GATA3

BCL2

ESR1 GATA3

SCUBE2

RARA E2F3

ODC1

MYC

link

ZEB1

JCAD

SNAI2

http://vassarstats.net/par.html

https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4694822/figure/F2/

64/80

Mutual Information

X

• Mutual Information is a very sophisticated method that is able to find ANY relationship

structure between two variables

• Mutual Information is a method designed for

categorical data. I.e. with finite distinct outcomes

("red", "blue", "green", etc.)

• GEPs are continuous variables with n observations

• The conversion of rank-transformed continuous to

categorical data requires a process called binning:

Y

The number of bins can vary.

In the common algorithm,

the nr. bins = the cube root of the nr. obs (n), rounded.

65/80

Mutual Information: calculation

X

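• Once binning is defined, one needs to calculate the entropy (H)

of each variable (X and Y) and the entropy of the joint distribution

(X,Y) with Shannon's formula:

$$H(X)=-\sum_{i}p(x_i)\log p(x_i)\qquad H(X,Y)=-\sum_{i}\sum_{j}p(x_i,y_j)\log p(x_i,y_j)$$

• Mutual Information between X and Y is defined as such:

$$MI(X;Y)=H(X)+H(Y)-H(X,Y)$$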

• In other words: if the disorder of the pair X-Y is lower than the sum of the disorder in the

separated variables, then there is a dependency between X and Y

66/80

Mutual Information in Real Life

"Oh no! I dOn't uNdErStaNd wHy wE aRe stuDyiNg this!

I want to learn useful stuff that I will use in real life!"

67/80

Mutual Information in Real Life

"Oh no! I dOn't uNdErStaNd wHy wE aRe stuDyiNg this!

I want to learn useful stuff that I will use in real life!"

>50,000

citations!!!

68/80

Mutual Information vs. PCC

Pearson correlation

Mutual information

Developmental dataset (Arabidopsis)

Pearson = 0.15

M.I. = 1.04

-0.5 0 0.5

0.2

0.4

0.6

0.8

1.0

1.2

69/80

Mutual Information: Practical Solution

70/80


71/80


72/80


73/80


74/80


75/80


76/80


77/80


78/80

Exercises! Mutual Information

1. (Long): Write a function to calculate Mutual Information between these two variables

(set nr bins = n^(1/3)). Tips: cut() for binning and table() for calculating occurrences.

– A correct function should give maximum self-MI: mi(x,x)=1.94

– The entropy is a sum of terms of the form p log(p). When p=0 you instead use the limiting value

(as p approaches 0 from above), which is 0

n<-300

set.seed(1)

x<-runif(n,-4,4)

y<-abs(rnorm(n)+x^3)

# Expected results:

cor(x,y,method="p") # 0.07353703

cor(x,y,method="s") # -0.01902377

mi(x,y) # 0.7567252

79/80

Exercises! Mutual Information

2. (Long): repeat my Arabidopsis analysis comparing PCC vs Mutual Info in the Breast cancer dataset.

Below is what it will look like for ESR1 edges only. However, we are looking for something more

diverging, if possible. Try as many genes as you can (the challenge is to test every possible gene-

gene pair without running out of RAM). An edge that is non-linear, non-monotonic, that can

underlie a non-canonical regulatory mechanism.

All ESR1 correlations (except ESR1 vs ESR1)

→ Every point in this plot is a gene pair!

80/80

Exercises! Mutual Information

2. (Long): repeat my Arabidopsis analysis comparing PCC vs Mutual Info in the Breast cancer dataset.

Below is what it will look like for ESR1 edges only. However, we are looking for something more

diverging, if possible. Try as many genes as you can (the challenge is to test every possible gene-

gene pair without running out of RAM). An edge that is non-linear, non-monotonic, that can

underlie a non-canonical regulatory mechanism.

?

All ESR1 correlations (except ESR1 vs ESR1)

→ Every point in this plot is a gene pair!

www.giorgilab.org

Federico M. Giorgi, PhD

Department of Pharmacy and Biotechnology

[email protected]

to EF, bringer of salted breakfasts

82/80

Tricks (Pearson Correlation Exercise)

##################
# Pearson Correlation exercise: build and test an ESR1-centred network

# Load the expression matrix. The file provides the object `expmat`
# (rows = gene symbols, columns = TCGA sample IDs).
# The URL must stay on a single line: a line break inside the string
# becomes part of the address.
load(url("https://www.dropbox.com/s/nrxrsq8m0gfjwqh/tcga_BRCA-expmat.rda?dl=1"))

### Play
# `type` takes a single character; "l" draws lines
plot(expmat["EGFR",],type="l")

# Gene Expression Profile (GEP)
mygep<-expmat["ESR1",]

# ## Correlation, matrix way
# # Keep only the top variance genes
# vars<-apply(expmat,1,var)
# vars<-sort(vars,decreasing=TRUE)
# topvars<-names(vars)[1:1000]
# # Correlation
# submat<-expmat[topvars,]
# cormat<-cor(t(submat))
# cors<-cormat["ESR1",]
# cor_genes<-names(sort(cors,decreasing=TRUE))[1:30]

## Correlation, one gene at a time
# vapply() fills a preallocated numeric vector instead of growing one with
# c() at every iteration, and seq_len() is safe for an empty matrix.
cors<-vapply(
  seq_len(nrow(expmat)),
  function(i){
    cor(mygep,expmat[i,])
  },
  numeric(1)
)
names(cors)<-rownames(expmat)

# Top 100 by decreasing PCC. ESR1 itself (r=1) sorts first, which is why
# the scatterplot below takes the second entry.
cor_genes<-names(sort(cors,decreasing=TRUE))[1:100]

# Plot them
othergep<-cor_genes[2]
plot(mygep,expmat[othergep,],xlab="ESR1",ylab=othergep)

# Draw the network
library(igraph)

# Build the edge table directly as a data frame: cbind() would create a
# character matrix and turn the weights into strings, breaking the
# arithmetic on edge widths below.
mydf<-data.frame(
  TF=rep("ESR1",length(cor_genes)),
  TG=cor_genes,
  Weight=cors[cor_genes],
  stringsAsFactors=FALSE
)
mydf[,3]

g <- graph_from_data_frame(mydf, directed=TRUE)
g <- set_edge_attr(g, "weight", value = mydf$Weight)

# ?plot.igraph
# png() does not create missing directories
dir.create("plots",showWarnings=FALSE)
png("plots/igraph.png",width=3000,height=3000,res=300)
plot(g, edge.width = E(g)$weight/5, edge.label=E(g)$weight) # with 30 it's nicer
dev.off()

### Load the motif-based target list from MSigDB
# http://software.broadinstitute.org/gsea/msigdb/cards/HALLMARK_ESTROGEN_RESPONSE_LATE
raw<-read.delim("data/06_networks/ertargets.txt",as.is=TRUE)
db_genes<-raw[,1]

## Enrichment. Expected test
ul<-length(intersect(db_genes,cor_genes))
100*ul/length(db_genes) # 6.5% of db_genes in our prediction
100*ul/length(cor_genes) # 13% of cor_genes in database
100*length(cor_genes)/nrow(expmat) # 0.5% of cor_genes

## Contingency table: upper-left, upper-right, down-left, down-right
shared<-intersect(db_genes,cor_genes)
ur<-length(setdiff(db_genes,shared)) # in database only
dl<-length(setdiff(cor_genes,shared)) # in our prediction only
ul<-length(shared) # in both
dr<-nrow(expmat)-ul-ur-dl # the "universe"
ctable<-rbind(c(ul,ur),c(dl,dr))

# One-sided test: is the overlap larger than expected by chance?
fisher.test(ctable,alternative="greater")

83/80

Tricks (Pearson vs Spearman)

############# PCC vs SCC
mygene<-"ESR1"
mygep<-expmat[mygene,]

# cor(vector, matrix) correlates the vector with every matrix column and
# returns a 1 x genes matrix; drop() flattens it to a plain vector.
pccs<-drop(cor(mygep,t(expmat),method="pearson"))
sccs<-drop(cor(mygep,t(expmat),method="spearman"))
plot(pccs,sccs)

# ESR1-other with the highest PCC vs. SCC difference
diffs<-abs(pccs-sccs)
names(diffs)<-rownames(expmat)
diffs<-sort(diffs,decreasing=TRUE)[1:10]
# SCG3 FBXL13 CHGB TRH TLX1NB ANK1 EREG CDKN2A FUT6 BTN1A1
# 0.2981869 0.2721195 0.2404624 0.2369656 0.2262180 0.2175203 0.2165480 0.2149584 0.2103572 0.2095228

# Inspect the most divergent pair
x<-expmat["ESR1",]
y<-expmat["SCG3",]
pcc<-cor(x,y)
# scc must be computed for this pair before it is used in the title
scc<-cor(x,y,method="spearman")
plot(x,y,pch=20,xlab="ESR1",ylab="SCG3")
title(paste0("PCC=",signif(pcc,4),"\nSCC=",signif(scc,4)))

84/80

############################## Partial Correlation


# First-order partial correlation between x and y, controlling for z.
#
# x, y, z  numeric vectors of equal length (paired observations)
# Returns  (r_xy - r_xz*r_yz) / sqrt((1 - r_xz^2) * (1 - r_yz^2)),
#          where every r is the Pearson coefficient given by cor().
pcor<-function(x,y,z){
  r_xy<-cor(x,y)
  r_xz<-cor(x,z)
  r_yz<-cor(y,z)
  (r_xy-r_xz*r_yz)/sqrt((1-r_xz^2)*(1-r_yz^2))
}

# Expression profiles (one row of expmat each) for three genes
x<-expmat["SNAI2",]

y<-expmat["ZEB1",]

z<-expmat["JCAD",]

# Standard pairwise Pearson correlations
cor(x,y)

cor(x,z)

cor(y,z)

# Partial correlations: the first two arguments are the pair being
# compared, the third is the variable being controlled for
pcor(x,y,z)

pcor(x,z,y)

pcor(y,z,x)

Tricks (Partial Correlation)

85/80

####################################################

################### MI

# Number of observations
n<-300

# Fixed seed so the values quoted in the comments below are reproducible
set.seed(1)

# x is uniform on [-4,4]; y is a non-monotonic function of x plus noise
x<-runif(n,-4,4)

y<-abs(rnorm(n)+x^3)

plot(x,y,pch=20)

# Number of bins: cube root of the number of observations, rounded
nrbins<-round(n^(1/3))

cor(x,y) # 0.07353703

cor(x,y,method="spearman") # -0.01902377

# NOTE: mi() is defined further down in this script; run its definition
# before executing this line
mi(x,y) # 0.7567252

# Mutual Information between two numeric vectors, in nats (natural log).
#
# Both vectors are discretised into `nrbins` equal-width bins with cut().
# MI is then H(X) + H(Y) - H(X,Y), where H is the Shannon entropy of the
# binned relative frequencies.
#
# x, y    numeric vectors of the same length (paired observations)
# nrbins  number of bins per variable; by default the rounded cube root of
#         the number of observations (at least 2, the minimum cut() accepts)
# Returns a single number. mi(x,x) equals the entropy of the binned x.
mi<-function(x,y,nrbins=max(2,round(length(x)^(1/3)))){
  stopifnot(length(x)==length(y))

  # Frequencies are relative to the length of the input itself, so the
  # result does not depend on a global `n` that may describe other data.
  n<-length(x)

  # Shannon entropy of a set of relative frequencies. Empty bins are
  # dropped: their term is the limit of p*log(p) as p -> 0, which is 0,
  # whereas evaluating 0*log(0) directly gives NaN.
  entropy<-function(freq){
    freq<-as.vector(freq)
    freq<-freq[freq>0]
    -sum(freq*log(freq))
  }

  # Binning
  xbinned<-cut(x,nrbins)
  ybinned<-cut(y,nrbins)

  # Entropy of the individual variables
  hx<-entropy(table(xbinned)/n)
  hy<-entropy(table(ybinned)/n)

  # Entropy of the joint distribution XY
  hxy<-entropy(table(xbinned,ybinned)/n)

  # Mutual Information
  hx+hy-hxy
}

Tricks (Mutual Information Function)

86/80

########## PCC vs. Mutual info
mygep<-expmat["ESR1",]
gene_idx<-seq_len(nrow(expmat))

## Correlation, one gene at a time
# vapply() returns a preallocated numeric vector, one PCC per gene
pccs<-vapply(
  gene_idx,
  function(i){
    cor(mygep,expmat[i,])
  },
  numeric(1)
)
names(pccs)<-rownames(expmat)

## MI, one gene at a time
mis<-vapply(
  gene_idx,
  function(i){
    mi(mygep,expmat[i,])
  },
  numeric(1)
)
names(mis)<-rownames(expmat)

## Compare MI vs PCC
# Drop the trivial ESR1-vs-ESR1 pair
pccs<-pccs[names(pccs)!="ESR1"]
mis<-mis[names(mis)!="ESR1"]
plot(pccs,mis,xlab="Pearson Correlation",ylab="Mutual Information")

# Select genes with low PCC
candidates1<-names(pccs[abs(pccs)<0.1])

# Select genes with high MI
candidates2<-names(mis[abs(mis)>0.18])

# Intersection
candidates<-intersect(candidates1,candidates2)
candidates # "NDN"

# Inspect the candidate edge
x<-expmat["ESR1",]
y<-expmat["NDN",]
pcc<-cor(x,y)
# scc must be computed for this pair before it is used in the title
scc<-cor(x,y,method="spearman")
mic<-mi(x,y)
plot(x,y,pch=20,xlab="ESR1",ylab="NDN")
title(paste0("PCC=",signif(pcc,4),"\nSCC=",signif(scc,4),"\nMIC=",signif(mic,4)))

Tricks (Mutual Information vs PCC)

88/80

Exercises!

• Do something

89/80

Solutions

# Some solution

s<-5

BIOINFORMATICS LAB Episode VI Gene Networks with R · BIOINFORMATICS LAB Episode VI ... X3 Level....

Documents

Transcript of BIOINFORMATICS LAB Episode VI Gene Networks with R · BIOINFORMATICS LAB Episode VI ... X3 Level....