Last data update: 2014.03.03
R: Principal Component Analysis (PCA) on SNP genotype data
Principal Component Analysis (PCA) on SNP genotype data
Description
To calculate the eigenvectors and eigenvalues for principal component
analysis in GWAS.
Usage
snpgdsPCA(gdsobj, sample.id=NULL, snp.id=NULL,
autosome.only=TRUE, remove.monosnp=TRUE, maf=NaN, missing.rate=NaN,
algorithm=c("exact", "randomized"),
eigen.cnt=ifelse(identical(algorithm, "randomized"), 16L, 32L),
num.thread=1L, bayesian=FALSE, need.genmat=FALSE,
genmat.only=FALSE, eigen.method=c("DSPEVX", "DSPEV"),
aux.dim=eigen.cnt*2L, iter.num=10L, verbose=TRUE)
Arguments
gdsobj
an object of class SNPGDSFileClass
,
a SNP GDS file
sample.id
a vector of sample id specifying selected samples;
if NULL, all samples are used
snp.id
a vector of snp id specifying selected SNPs; if NULL,
all SNPs are used
autosome.only
if TRUE
, use autosomal SNPs only; if it is a
numeric or character value, keep SNPs according to the specified
chromosome
remove.monosnp
if TRUE, remove monomorphic SNPs
maf
to use the SNPs with ">= maf" only; if NaN, no MAF threshold
missing.rate
to use the SNPs with "<= missing.rate" only; if NaN,
no missing threshold
eigen.cnt
output the number of eigenvectors; if eigen.cnt <= 0, then
return all eigenvectors
algorithm
"exact", traditional exact calculation; "randomized",
fast PCA with randomized algorithm introduced in Galinsky et al. 2016
num.thread
the number of (CPU) cores used; if NA
, detect
the number of cores automatically
bayesian
if TRUE, use bayesian normalization
need.genmat
if TRUE, return the genetic covariance matrix
genmat.only
return the genetic covariance matrix only, do not
compute the eigenvalues and eigenvectors
eigen.method
"DSPEVX" – compute the top eigen.cnt
eigenvalues and eigenvectors using LAPACK::DSPEVX; "DSPEV" – to be
compatible with SNPRelate_1.1.6 or earlier, using LAPACK::DSPEV;
"DSPEVX" is significantly faster than "DSPEV" if only top principal
components are of interest
aux.dim
auxiliary dimension used in fast randomized algorithm
iter.num
iteration number used in fast randomized algorithm
verbose
if TRUE, show information
Details
The minor allele frequency and missing rate for each SNP passed in
snp.id
are calculated over all the samples in sample.id
.
Value
Return a snpgdsPCAClass
object, and it is a list:
sample.id
the sample ids used in the analysis
snp.id
the SNP ids used in the analysis
eigenval
eigenvalues
eigenvect
eigenvactors, "# of samples" x "eigen.cnt"
varprop
variance proportion for each principal component
TraceXTX
the trace of the genetic covariance matrix
Bayesian
whether use bayerisan normalization
genmat
the genetic covariance matrix
Author(s)
Xiuwen Zheng
References
Patterson N, Price AL, Reich D. Population structure and eigenanalysis.
PLoS Genet. 2006 Dec;2(12):e190.
Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ,
Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution of
ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72.
doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25.
See Also
snpgdsPCACorr
, snpgdsPCASampLoading
,
snpgdsPCASNPLoading
Examples
# open an example dataset (HapMap)
genofile <- snpgdsOpen(snpgdsExampleFileName())
# run PCA
RV <- snpgdsPCA(genofile)
# eigenvalues
head(RV$eigenval)
# variance proportion (%)
head(round(RV$varprop*100, 2))
# [1] 12.23 5.84 1.01 0.95 0.84 0.74
#### there is no population information ####
# make a data.frame
tab <- data.frame(sample.id = RV$sample.id,
EV1 = RV$eigenvect[,1], # the first eigenvector
EV2 = RV$eigenvect[,2], # the second eigenvector
stringsAsFactors = FALSE)
head(tab)
# sample.id EV1 EV2
# 1 NA19152 -0.08411287 -0.01226860
# 2 NA19139 -0.08360644 -0.01085849
# 3 NA18912 -0.08110808 -0.01184524
# 4 NA19160 -0.08680864 -0.01447106
# 5 NA07034 0.03109761 0.07709255
# 6 NA07055 0.03228450 0.08155730
# draw
plot(tab$EV2, tab$EV1, xlab="eigenvector 2", ylab="eigenvector 1")
#### there are population information ####
# get population information
# or pop_code <- scan("pop.txt", what=character())
# if it is stored in a text file "pop.txt"
pop_code <- read.gdsn(index.gdsn(genofile, "sample.annot/pop.group"))
# get sample id
samp.id <- read.gdsn(index.gdsn(genofile, "sample.id"))
# assume the order of sample IDs is as the same as population codes
cbind(samp.id, pop_code)
# samp.id pop_code
# [1,] "NA19152" "YRI"
# [2,] "NA19139" "YRI"
# [3,] "NA18912" "YRI"
# [4,] "NA19160" "YRI"
# [5,] "NA07034" "CEU"
# ...
# make a data.frame
tab <- data.frame(sample.id = RV$sample.id,
pop = factor(pop_code)[match(RV$sample.id, samp.id)],
EV1 = RV$eigenvect[,1], # the first eigenvector
EV2 = RV$eigenvect[,2], # the second eigenvector
stringsAsFactors = FALSE)
head(tab)
# sample.id pop EV1 EV2
# 1 NA19152 YRI -0.08411287 -0.01226860
# 2 NA19139 YRI -0.08360644 -0.01085849
# 3 NA18912 YRI -0.08110808 -0.01184524
# 4 NA19160 YRI -0.08680864 -0.01447106
# 5 NA07034 CEU 0.03109761 0.07709255
# 6 NA07055 CEU 0.03228450 0.08155730
# draw
plot(tab$EV2, tab$EV1, col=as.integer(tab$pop),
xlab="eigenvector 2", ylab="eigenvector 1")
legend("bottomright", legend=levels(tab$pop), pch="o", col=1:4)
# close the file
snpgdsClose(genofile)
Results
R version 3.3.1 (2016-06-21) -- "Bug in Your Hair"
Copyright (C) 2016 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)
R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.
R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.
Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.
> library(SNPRelate)
Loading required package: gdsfmt
SNPRelate -- supported by Streaming SIMD Extensions 2 (SSE2)
> png(filename="/home/ddbj/snapshot/RGM3/R_BC/result/SNPRelate/snpgdsPCA.Rd_%03d_medium.png", width=480, height=480)
> ### Name: snpgdsPCA
> ### Title: Principal Component Analysis (PCA) on SNP genotype data
> ### Aliases: snpgdsPCA
> ### Keywords: PCA GDS GWAS
>
> ### ** Examples
>
> # open an example dataset (HapMap)
> genofile <- snpgdsOpen(snpgdsExampleFileName())
>
> # run PCA
> RV <- snpgdsPCA(genofile)
Principal Component Analysis (PCA) on SNP genotypes:
Excluding 365 SNPs on non-autosomes
Excluding 1 SNP (monomorphic: TRUE, < MAF: NaN, or > missing rate: NaN)
Working space: 279 samples, 8722 SNPs
using 1 (CPU) core
PCA: the sum of all selected genotypes (0, 1 and 2) = 2446510
Wed Jul 6 05:34:46 2016 (internal increment: 1744)
[>.................................................] 0%, ETC: NA [==========>.......................................] 20%, ETC: 0s [====================>.............................] 40%, ETC: 0s [==============================>...................] 60%, ETC: 0s [========================================>.........] 80%, ETC: 0s [==================================================] 100%, ETC: 0s [==================================================] 100%, completed
Wed Jul 6 05:34:47 2016 Begin (eigenvalues and eigenvectors)
Wed Jul 6 05:34:47 2016 Done.
>
> # eigenvalues
> head(RV$eigenval)
[1] 33.985988 16.234579 2.810976 2.634050 2.345686 2.047077
>
> # variance proportion (%)
> head(round(RV$varprop*100, 2))
[1] 12.23 5.84 1.01 0.95 0.84 0.74
> # [1] 12.23 5.84 1.01 0.95 0.84 0.74
>
>
> #### there is no population information ####
>
> # make a data.frame
> tab <- data.frame(sample.id = RV$sample.id,
+ EV1 = RV$eigenvect[,1], # the first eigenvector
+ EV2 = RV$eigenvect[,2], # the second eigenvector
+ stringsAsFactors = FALSE)
> head(tab)
sample.id EV1 EV2
1 NA19152 -0.08411287 -0.01226860
2 NA19139 -0.08360644 -0.01085849
3 NA18912 -0.08110808 -0.01184524
4 NA19160 -0.08680864 -0.01447106
5 NA07034 0.03109761 0.07709255
6 NA07055 0.03228450 0.08155730
> # sample.id EV1 EV2
> # 1 NA19152 -0.08411287 -0.01226860
> # 2 NA19139 -0.08360644 -0.01085849
> # 3 NA18912 -0.08110808 -0.01184524
> # 4 NA19160 -0.08680864 -0.01447106
> # 5 NA07034 0.03109761 0.07709255
> # 6 NA07055 0.03228450 0.08155730
>
> # draw
> plot(tab$EV2, tab$EV1, xlab="eigenvector 2", ylab="eigenvector 1")
>
>
>
> #### there are population information ####
>
> # get population information
> # or pop_code <- scan("pop.txt", what=character())
> # if it is stored in a text file "pop.txt"
> pop_code <- read.gdsn(index.gdsn(genofile, "sample.annot/pop.group"))
>
> # get sample id
> samp.id <- read.gdsn(index.gdsn(genofile, "sample.id"))
>
> # assume the order of sample IDs is as the same as population codes
> cbind(samp.id, pop_code)
samp.id pop_code
[1,] "NA19152" "YRI"
[2,] "NA19139" "YRI"
[3,] "NA18912" "YRI"
[4,] "NA19160" "YRI"
[5,] "NA07034" "CEU"
[6,] "NA07055" "CEU"
[7,] "NA12814" "CEU"
[8,] "NA10847" "CEU"
[9,] "NA18532" "HCB"
[10,] "NA18561" "HCB"
[11,] "NA18942" "JPT"
[12,] "NA18940" "JPT"
[13,] "NA18515" "YRI"
[14,] "NA19222" "YRI"
[15,] "NA18508" "YRI"
[16,] "NA19129" "YRI"
[17,] "NA12056" "CEU"
[18,] "NA10863" "CEU"
[19,] "NA10857" "CEU"
[20,] "NA12006" "CEU"
[21,] "NA18605" "HCB"
[22,] "NA18542" "HCB"
[23,] "NA18945" "JPT"
[24,] "NA18949" "JPT"
[25,] "NA19238" "YRI"
[26,] "NA19194" "YRI"
[27,] "NA19142" "YRI"
[28,] "NA18913" "YRI"
[29,] "NA12145" "CEU"
[30,] "NA12763" "CEU"
[31,] "NA07357" "CEU"
[32,] "NA12144" "CEU"
[33,] "NA18550" "HCB"
[34,] "NA18608" "HCB"
[35,] "NA18964" "JPT"
[36,] "NA18953" "JPT"
[37,] "NA19154" "YRI"
[38,] "NA19138" "YRI"
[39,] "NA18852" "YRI"
[40,] "NA19120" "YRI"
[41,] "NA07019" "CEU"
[42,] "NA10831" "CEU"
[43,] "NA07000" "CEU"
[44,] "NA11832" "CEU"
[45,] "NA18564" "HCB"
[46,] "NA18961" "JPT"
[47,] "NA18972" "JPT"
[48,] "NA19210" "YRI"
[49,] "NA19204" "YRI"
[50,] "NA18507" "YRI"
[51,] "NA19159" "YRI"
[52,] "NA06991" "CEU"
[53,] "NA11840" "CEU"
[54,] "NA12802" "CEU"
[55,] "NA12813" "CEU"
[56,] "NA18571" "HCB"
[57,] "NA18620" "HCB"
[58,] "NA18967" "JPT"
[59,] "NA18976" "JPT"
[60,] "NA19211" "YRI"
[61,] "NA18516" "YRI"
[62,] "NA18523" "YRI"
[63,] "NA12761" "CEU"
[64,] "NA10830" "CEU"
[65,] "NA10855" "CEU"
[66,] "NA18623" "HCB"
[67,] "NA18576" "HCB"
[68,] "NA18981" "JPT"
[69,] "NA18971" "JPT"
[70,] "NA18862" "YRI"
[71,] "NA19205" "YRI"
[72,] "NA19101" "YRI"
[73,] "NA19102" "YRI"
[74,] "NA06994" "CEU"
[75,] "NA11993" "CEU"
[76,] "NA11995" "CEU"
[77,] "NA12891" "CEU"
[78,] "NA18582" "HCB"
[79,] "NA18633" "HCB"
[80,] "NA18994" "JPT"
[81,] "NA18872" "YRI"
[82,] "NA19192" "YRI"
[83,] "NA19172" "YRI"
[84,] "NA19092" "YRI"
[85,] "NA12864" "CEU"
[86,] "NA12751" "CEU"
[87,] "NA10839" "CEU"
[88,] "NA12717" "CEU"
[89,] "NA18637" "HCB"
[90,] "NA18594" "HCB"
[91,] "NA18998" "JPT"
[92,] "NA19000" "JPT"
[93,] "NA12753" "CEU"
[94,] "NA12707" "CEU"
[95,] "NA11839" "CEU"
[96,] "NA10859" "CEU"
[97,] "NA18526" "HCB"
[98,] "NA18524" "HCB"
[99,] "NA18529" "HCB"
[100,] "NA18558" "HCB"
[101,] "NA18502" "YRI"
[102,] "NA19145" "YRI"
[103,] "NA18505" "YRI"
[104,] "NA18856" "YRI"
[105,] "NA12875" "CEU"
[106,] "NA12156.dup" "CEU"
[107,] "NA12156" "CEU"
[108,] "NA07056" "CEU"
[109,] "NA18562" "HCB"
[110,] "NA18537" "HCB"
[111,] "NA18603" "HCB"
[112,] "NA18540" "HCB"
[113,] "NA19153" "YRI"
[114,] "NA19137" "YRI"
[115,] "NA19202" "YRI"
[116,] "NA19239" "YRI"
[117,] "NA12044" "CEU"
[118,] "NA11992" "CEU"
[119,] "NA11829" "CEU"
[120,] "NA07022" "CEU"
[121,] "NA18545" "HCB"
[122,] "NA18572" "HCB"
[123,] "NA18547" "HCB"
[124,] "NA18609.dup" "HCB"
[125,] "NA18857" "YRI"
[126,] "NA19238.dup" "YRI"
[127,] "NA18501" "YRI"
[128,] "NA19240" "YRI"
[129,] "NA06993.dup" "CEU"
[130,] "NA12239" "CEU"
[131,] "NA12154" "CEU"
[132,] "NA12762" "CEU"
[133,] "NA18609" "HCB"
[134,] "NA18552" "HCB"
[135,] "NA18611" "HCB"
[136,] "NA18555" "HCB"
[137,] "NA19223" "YRI"
[138,] "NA18500" "YRI"
[139,] "NA18861" "YRI"
[140,] "NA12716" "CEU"
[141,] "NA12878" "CEU"
[142,] "NA10856" "CEU"
[143,] "NA12874" "CEU"
[144,] "NA18566" "HCB"
[145,] "NA18563" "HCB"
[146,] "NA18570" "HCB"
[147,] "NA18612" "HCB"
[148,] "NA19201" "YRI"
[149,] "NA19144" "YRI"
[150,] "NA19193" "YRI"
[151,] "NA18503" "YRI"
[152,] "NA12760" "CEU"
[153,] "NA11993.dup" "CEU"
[154,] "NA06985" "CEU"
[155,] "NA12003" "CEU"
[156,] "NA18621" "HCB"
[157,] "NA18594.dup" "HCB"
[158,] "NA18622" "HCB"
[159,] "NA18573" "HCB"
[160,] "NA18504" "YRI"
[161,] "NA19203" "YRI"
[162,] "NA19143" "YRI"
[163,] "NA18871" "YRI"
[164,] "NA07348" "CEU"
[165,] "NA12750" "CEU"
[166,] "NA11831" "CEU"
[167,] "NA10835" "CEU"
[168,] "NA18577" "HCB"
[169,] "NA18624" "HCB"
[170,] "NA18579" "HCB"
[171,] "NA18632" "HCB"
[172,] "NA18870" "YRI"
[173,] "NA19200" "YRI"
[174,] "NA18517" "YRI"
[175,] "NA19221" "YRI"
[176,] "NA12146" "CEU"
[177,] "NA11882" "CEU"
[178,] "NA18635" "HCB"
[179,] "NA18592" "HCB"
[180,] "NA18636" "HCB"
[181,] "NA18593" "HCB"
[182,] "NA18863" "YRI"
[183,] "NA18855" "YRI"
[184,] "NA18862.dup" "YRI"
[185,] "NA19209" "YRI"
[186,] "NA18951.dup" "JPT"
[187,] "NA18943" "JPT"
[188,] "NA18947" "JPT"
[189,] "NA18944" "JPT"
[190,] "NA18521" "YRI"
[191,] "NA18858" "YRI"
[192,] "NA19141" "YRI"
[193,] "NA19093" "YRI"
[194,] "NA10861" "CEU"
[195,] "NA12005" "CEU"
[196,] "NA10851" "CEU"
[197,] "NA12234" "CEU"
[198,] "NA18948" "JPT"
[199,] "NA18951" "JPT"
[200,] "NA18952" "JPT"
[201,] "NA18956" "JPT"
[202,] "NA19099" "YRI"
[203,] "NA18854" "YRI"
[204,] "NA19132" "YRI"
[205,] "NA12004" "CEU"
[206,] "NA07345" "CEU"
[207,] "NA07029" "CEU"
[208,] "NA12892" "CEU"
[209,] "NA18968" "JPT"
[210,] "NA18959" "JPT"
[211,] "NA18969" "JPT"
[212,] "NA18960" "JPT"
[213,] "NA19206" "YRI"
[214,] "NA18506" "YRI"
[215,] "NA19098" "YRI"
[216,] "NA19130" "YRI"
[217,] "NA07048" "CEU"
[218,] "NA10854" "CEU"
[219,] "NA12248" "CEU"
[220,] "NA10846" "CEU"
[221,] "NA18965" "JPT"
[222,] "NA18973" "JPT"
[223,] "NA18966" "JPT"
[224,] "NA18975" "JPT"
[225,] "NA19128" "YRI"
[226,] "NA19131" "YRI"
[227,] "NA18522" "YRI"
[228,] "NA19208" "YRI"
[229,] "NA12801" "CEU"
[230,] "NA12872" "CEU"
[231,] "NA12155" "CEU"
[232,] "NA06993" "CEU"
[233,] "NA18978" "JPT"
[234,] "NA18970" "JPT"
[235,] "NA18980" "JPT"
[236,] "NA18995.dup" "JPT"
[237,] "NA18859.dup" "YRI"
[238,] "NA19116" "YRI"
[239,] "NA18914" "YRI"
[240,] "NA19127" "YRI"
[241,] "NA11830" "CEU"
[242,] "NA12865" "CEU"
[243,] "NA10838" "CEU"
[244,] "NA12249" "CEU"
[245,] "NA18974" "JPT"
[246,] "NA18987" "JPT"
[247,] "NA18990" "JPT"
[248,] "NA18991" "JPT"
[249,] "NA19094" "YRI"
[250,] "NA19161" "YRI"
[251,] "NA18853" "YRI"
[252,] "NA19173" "YRI"
[253,] "NA12057" "CEU"
[254,] "NA10860" "CEU"
[255,] "NA12812" "CEU"
[256,] "NA11881" "CEU"
[257,] "NA18992" "JPT"
[258,] "NA18995" "JPT"
[259,] "NA18997" "JPT"
[260,] "NA19012" "JPT"
[261,] "NA19171" "YRI"
[262,] "NA19140" "YRI"
[263,] "NA18859" "YRI"
[264,] "NA19119" "YRI"
[265,] "NA11994" "CEU"
[266,] "NA12873" "CEU"
[267,] "NA12815" "CEU"
[268,] "NA19005" "JPT"
[269,] "NA18999" "JPT"
[270,] "NA19007" "JPT"
[271,] "NA19003" "JPT"
[272,] "NA18860" "YRI"
[273,] "NA19207" "YRI"
[274,] "NA19100" "YRI"
[275,] "NA19103" "YRI"
[276,] "NA12740" "CEU"
[277,] "NA12752" "CEU"
[278,] "NA12043" "CEU"
[279,] "NA12264" "CEU"
> # samp.id pop_code
> # [1,] "NA19152" "YRI"
> # [2,] "NA19139" "YRI"
> # [3,] "NA18912" "YRI"
> # [4,] "NA19160" "YRI"
> # [5,] "NA07034" "CEU"
> # ...
>
> # make a data.frame
> tab <- data.frame(sample.id = RV$sample.id,
+ pop = factor(pop_code)[match(RV$sample.id, samp.id)],
+ EV1 = RV$eigenvect[,1], # the first eigenvector
+ EV2 = RV$eigenvect[,2], # the second eigenvector
+ stringsAsFactors = FALSE)
> head(tab)
sample.id pop EV1 EV2
1 NA19152 YRI -0.08411287 -0.01226860
2 NA19139 YRI -0.08360644 -0.01085849
3 NA18912 YRI -0.08110808 -0.01184524
4 NA19160 YRI -0.08680864 -0.01447106
5 NA07034 CEU 0.03109761 0.07709255
6 NA07055 CEU 0.03228450 0.08155730
> # sample.id pop EV1 EV2
> # 1 NA19152 YRI -0.08411287 -0.01226860
> # 2 NA19139 YRI -0.08360644 -0.01085849
> # 3 NA18912 YRI -0.08110808 -0.01184524
> # 4 NA19160 YRI -0.08680864 -0.01447106
> # 5 NA07034 CEU 0.03109761 0.07709255
> # 6 NA07055 CEU 0.03228450 0.08155730
>
> # draw
> plot(tab$EV2, tab$EV1, col=as.integer(tab$pop),
+ xlab="eigenvector 2", ylab="eigenvector 1")
> legend("bottomright", legend=levels(tab$pop), pch="o", col=1:4)
>
>
> # close the file
> snpgdsClose(genofile)
>
>
>
>
>
> dev.off()
null device
1
>