This first part of the lab will use Rstudio.
If you are using google shell remember to install R in your system
cd /home/nicola_barban/Sociogenomics/Software
sudo apt-get install r-base r-base-dev
R
Use R to download the summary statistics
download.file("http://ssgac.org/documents/EduYears_Main.txt.gz", dest="EA2_results.txt.gz")
Import the summary statistics in R
gwasResults<-read.table("EA2_results.txt.gz", header=T)
head(gwasResults)
dim(gwasRsults)
This is a very large file, we can spped up things by selecting ony SNPS with Pvalue >0.05
gwasResults<-subset(gwasResults, Pval<0.05)
dim(gwasResults)
A first attempt to create a manhattan plot with data from one single Chromosome
library(ggplot2)
names(gwasResults)
man<-ggplot(subset(gwasResults, CHR==3), aes(x = POS, y = -log10(Pval))) +
geom_hline(yintercept = -log10(5e-08), color = "grey40", linetype = "dashed") +
geom_point(alpha = 0.75) +
labs(x = NULL,
y = "-log10(p)") +
theme_minimal()
man
We can use a library created to plot manhattan plots Load the qqman library
install.packages("qqman")
library(qqman)
Create Manhattan plot and save the figure into an external png file format
png(file="manhattan_without_highlights.png" , width = 3600, height = 2400, res=300)
manhattan(gwasResults, chr="CHR", bp="POS", snp="MarkerName", p="Pval",suggestiveline=F)
dev.off()
Now, let’s try to highlight the genome-wide significant hits. Forst let’s define hits
hits<-gwasResults[gwasResults$Pval<5e-08,]
create new variable with genome regions that we want to highlight (we used an interval of +/- 5k base pairs)
gwasResults$highlight.snps<-0
for ( i in 1: dim(hits)[1]){
chr<-hits[i,2]
loc_min<- hits[i, 3]-5000
loc_max<- hits[i, 3]+5000
neighbours.snps<-gwasResults$MarkerName[gwasResults$CHR==chr &
gwasResults$POS>loc_min &
gwasResults$POS<loc_max]
gwasResults$highlight.snps[gwasResults$MarkerName %in%
neighbours.snps] <- 1
}
Now we add the highlight option to the Manhattan plot
manhattan(gwasResults, chr="CHR", bp="POS", snp="MarkerName", p="Pval", highlight=gwasResults$MarkerName[gwasResults$highlight.snps==1],suggestiveline=F, col=1)
png(file="manhattan_with_highlights.png" , width = 3600, height = 2400, res=300)
dev.off()
We can use the same library to plot QQplots
png(file="qqplot.png" , width = 1200, height = 1200, res=300)
qq(gwasResults$Pval)
dev.off()
import data on genetic correlation
data_rg<-read.table("LD-Hub_genetic_correlation_example.txt",fill =T, sep="\t", header=T, quote="")
draw heatmap
ggplot(data = data_rg, aes(Trait1, Trait2, fill = rg))+
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid =
"white", midpoint = 0, limit =
c(-1.1,1.1), space = "Lab",
name="Genetic\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 8, hjust = 1))+
coord_fixed()
First, some data cleaning to free some space on the google shell account
cd /home/nicola_barban/Sociogenomics/Data/
rm *.*
cd /home/nicola_barban/Sociogenomics/Results/
rm *.*
This requires time and installs miniconda (which is a python distribution) in the system
cd /home/nicola_barban/Sociogenomics/Software
wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh
bash ~/miniconda.sh -b -p
rm ~/miniconda.sh
source $HOME/miniconda3/bin/activate
This installs LDSC in your system
cd /home/nicola_barban/Sociogenomics/Software
git clone https://github.com/bulik/ldsc.git
cd ldsc
Congratulations you have installed LDSC now you have to install some packeges that LDSC uses for its calculations
conda env create --file environment.yml
source activate ldsc
Now, let’s check if everything works
./ldsc.py -h
./munge_sumstats.py -h
This is the LDSC command to estimate heritability. We need the following files:
cd /home/nicola_barban/Sociogenomics
First we need to “munge” summary statistics, that is to make ti comparable with reference data that have info on LD scores
##This takes a lot of time. you can skip this and find the prepared data in the folder
python Software/ldsc/munge_sumstats.py \
--sumstats Data/Giant_Height2018.txt.gz \
--snp SNP \
--N-col N \
--a1 Tested_Allele \
--a2 Other_Allele \
--p P \
--frq Freq_Tested_Allele_in_HRS \
--out Results/height \
--merge-alleles Data/w_hm3.snplist
The folder eur_w_ld_chr/ contains the LD scores that are merged with our summary statistics
python Software/ldsc/ldsc.py \
--h2 Data/height.sumstats.gz \
--ref-ld-chr Data/eur_w_ld_chr/ \
--w-ld-chr Data/eur_w_ld_chr/ \
--out Results/height_h2
Now we can try to estimate genetic correlation between height and educational attainment.
First, we need to prepare data for Educational Attainment
##This takes a lot of time. you can skip this and find the prepared data in the folder
python Software/ldsc/munge_sumstats.py \
--sumstats Data/EA2_results.txt.gz \
--snp SNP\
--N 500000 \
--a1 A1 \
--a2 A2 \
--p Pval \
--frq EAF \
--out Reults/Educ \
--merge-alleles Data/w_hm3.snplist
And this is finally the command to estimate genetic correlation
python Software/ldsc/ldsc.py \
--rg Data/height.sumstats.gz,Data/Educ.sumstats.gz \
--ref-ld-chr Data/eur_w_ld_chr/ \
--w-ld-chr Data/eur_w_ld_chr/ \
--out Results/height_educ