sociogenomics2022

Lab week 5. Polygenic scores

We start from a “monogenic score” based only on 1 SNP rs9930506 the file score_rs9930506.txt includes the weights

./plink 	--bfile 1kg_hm3_qc \
 		--score Data/score_rs9930506.txt 1 2 3 \
 	--pheno Data/BMI_pheno.txt \
 	--out Results/1kg_FTOscore

This is our “monogenic score”

head Results/1kg_FTOscore.profile

To calculate PGS we need additional software:

PRSice (https://www.prsice.info)
R

cd Software
wget https://github.com/choishingwan/PRSice/releases/download/2.3.5/PRSice_linux.zip
unzip PRSice_linux.zip

Install R on the virtual machine

sudo apt-get install r-base r-base-dev

let’s see if it works

R
install.packages(c("ggplot2", "data.table"))

./PRSice_linux -h

Create symbolic links to PRSice in the Home directory

cd $HOME
cd Sociogenomics

ln -s Software/PRSice.R
ln -s Software/PRSice_linux

Setting up DATA and RESULTS folder variables

DATA="/home/nicola_barban/Sociogenomics/Data"
RESULTS="/home/nicola_barban/Sociogenomics/Results"

Calculating a Polygenic score on BMI based on BMI.txt

Rscript PRSice.R --dir . \
    --prsice ./PRSice_linux \
    --base ${DATA}/BMI.txt \
    --target ${DATA}/1kg_hm3_qc \
    --snp MarkerName \
    --A1 A1 \
    --A2 A2 \
    --stat Beta \
    --pvalue Pval \
    --pheno-file ${DATA}/BMI_pheno.txt \
    --bar-levels 1 \
    --fastscore \
    --binary-target F \
    --out ${RESULTS}/BMI_score_all

See the ouput. there are 9 duplicated SNPs that can be removed with the command extract

Rscript PRSice.R --dir . \
    --prsice ./PRSice_linux \
    --base ${DATA}/BMI.txt \
    --target ${DATA}/1kg_hm3_qc \
    --snp MarkerName \
    --A1 A1 \
    --A2 A2 \
    --stat Beta \
    --pvalue Pval \
    --pheno-file ${DATA}/BMI_pheno.txt \
    --bar-levels 1 \
    --fastscore \
    --binary-target F \
    --extract ${RESULTS}/BMI_score_all.valid \
    --out ${RESULTS}/BMI_score_all

Calculating different scores at various pvalues

Rscript PRSice.R --dir . \
    --prsice ./PRSice_linux \
    --base ${DATA}/BMI.txt \
    --target ${DATA}/1kg_hm3_qc \
    --thread 1 \
    --snp MarkerName \
    --A1 A1 \
    --A2 A2 \
    --stat Beta \
    --pvalue Pval \
    --bar-levels 5e-08,5e-07,5e-06,5e-05,5e-04,5e-03,5e-02,5e-01 \
    --fastscore \
    --all-score \
    --no-regress \
    --binary-target F \
    --extract ${RESULTS}/BMI_score_all.valid \
    --out ${RESULTS}/BMIscore_thresholds  

See results

head ${RESULTS}/BMIscore_thresholds.all_score

Rscript PRSice.R --dir . \
    --prsice ./PRSice_linux\
    --base ${DATA}/BMI.txt \
    --target ${DATA}/1kg_hm3_qc \
    --thread 1 \
    --snp MarkerName \
    --A1 A1 \
    --A2 A2 \
    --stat Beta \
    --pvalue Pval \
    --pheno-file ${DATA}/BMI_pheno.txt \
    --interval 0.00005 \
    --lower 0.0001 \
    --quantile 20 \
    --all-score \
    --binary-target F \
    --extract ${RESULTS}/BMI_score_all.valid \
    --out ${RESULTS}/BMIscore_graphics

PGS on binary trait

Rscript PRSice.R --dir . \
    --prsice ./PRSice_linux \
    --base ${DATA}/BMI.txt \
    --target ${DATA}/1kg_hm3_qc \
    --thread 1 \
    --snp MarkerName \
    --A1 A1 \
    --A2 A2 \
    --stat Beta \
    --pvalue Pval \
    --no-clump F \
    --pheno-file ${DATA}/Obesity_pheno.txt \
    --interval 0.00005 \
    --lower 0.0001 \
    --quantile 5 \
    --all-score \
    --binary-target T \
    --extract ${RESULTS}/BMI_score_all.valid \
    --out ${RESULTS}/Obesity_score_graphics

THIS NEED TO BE EXECUTED IN R

#import external data
data<-read.table("BMIscore_thresholds.all_score", header=T)

# show first rows of data
head(data)

# calculate new standardised variable
data$PGS=(data$Pt_1-mean(data$Pt_1))/sd(data$Pt_1, na.rm=T)

#plot histogram of the polygenic score 
 hist(data$PGS)

# import external data with phenotype
pheno_BMI<-read.table("../Data/BMI_pheno.txt", header=T)

# merge the two datasets
data.with.pheno<-merge(data,pheno_BMI, by="IID")

# run a linear regression model
mod1<-(lm(BMI~PGS, data=data.with.pheno))
summary(mod1)


summary(mod1)$r.square

# create a vector with column names
columns=c("FID", "IID", "pca1", "pca2", "pca3", "pca4", "pca5", "pca6", "pca7", "pca8", "pca9", "pca10")

# read external data with PCAs
pca<- read.table(file="../Data/pca.eigenvec", sep = "", header=F, col.names=columns)[,c(2:12)]

# merge file with covariates with the rest of the file
data.with.covars<-merge(data.with.pheno,pca, by="IID")

# Estimate new linear regression model
mod2<-lm(BMI~PGS+pca1+pca2+pca3+pca3+pca5+pca6+pca7+pca8+pca9+pca10, data=data.with.covars)

# Calculate R-Square
summary(mod2)$r.square

# Estimate linear regression model
mod2.no.pgs<-update(mod2,  ~ . - PGS)

# Estimate difference in R-squared
print(summary(mod2)$r.square-summary(mod2.no.pgs)$r.square)
# install new package in R to perform bootstrap

# install new package in R to perform bootstrap

install.packages("boot")
library(boot)
set.seed(12345)

# define new function to calculate differential r-squared.
# The following commands define the new function
rsq <- function(formula, PGS=PGS, data, indices) {
  d <- data[indices,] 
  fit1 <- lm(formula,  data=d)
  fit2 <- update(fit1,  ~ . + PGS)
  
  return(summary(fit2)$r.square-summary(fit1)$r.square)
} 

# bootstrapping with 1000 replications
results <- boot(data=data.with.covars, statistic=rsq, 
  	R=1000, formula=BMI~pca1+pca2+pca3+pca3+pca5+pca6+pca7+pca8+pca9+pca10, PGS=PGS)
boot.ci(results, type="norm")      

# import external data with multi-ancestry information

data<-read.table("BMIscore_MULTIANCESTRY.best", header=T)
head(data)

# Calculate  standardized PGS
data$PGS=(data$PRS-mean(data$PRS))/sd(data$PRS, na.rm=T)

data.with.pheno<-merge(data,pheno_BMI, by="IID")

#import dataset with geographical information (ancestral populations)
geo<-read.table(file="../Data/1kg_samples.txt", sep = "\t", header=T)[, c(1,4,5,6,7)]
names(geo)[1]<-"IID"

# Create a vector with new columns names
columns=c("FID", "IID", "pca1", "pca2", "pca3", "pca4", "pca5", "pca6", "pca7", "pca8", "pca9", "pca10")

# import exteral data with PCAs
pca<- read.table(file="pca.eigenvec", sep = "", header=F, col.names=columns)[,c(2:12)]

# merge information with covariates and ancestral information
data.with.covars<-merge(data.with.pheno,pca, by="IID")
data.with.geo<-merge(data.with.covars,geo, by="IID")

# fit a linear regression model
mod.AFR<-lm(BMI~PGS, data=subset(data.with.geo, Superpopulation.name=="African"))

summary(mod.AFR)$r.square

# run bootstrap function to  estimate increased R squared
results <- boot(data=subset(data.with.geo, Superpopulation.name=="African"), statistic=rsq, 
  	R=1000, formula=BMI~pca1+pca2+pca3+pca3+pca5+pca6+pca7+pca8+pca9+pca10, PGS=PGS)

boot.ci(results, type="norm")