Last data update: 2014.03.03

R: Parallel Computing Toolset for Genome-Wide Association...
SNPRelate-packageR Documentation

Parallel Computing Toolset for Genome-Wide Association Studies

Description

Genome-wide association studies are widely used to investigate the genetic basis of diseases and traits, but they pose many computational challenges. We developed SNPRelate (R package for multi-core symmetric multiprocessing computer architectures) to accelerate two key computations on SNP data: principal component analysis (PCA) and relatedness analysis using identity-by-descent measures. The kernels of our algorithms are written in C/C++ and highly optimized.

Details

Package: SNPRelate
Type: Package
License: GPL version 3
Depends: gdsfmt (>= 1.0.4)

The genotypes stored in GDS format can be analyzed by the R functions in SNPRelate, which utilize the multi-core feature of machine for a single computer.

Webpage: http://github.com/zhengxwen/SNPRelate, http://corearray.sourceforge.net/

Tutorial: http://corearray.sourceforge.net/tutorials/SNPRelate/

Author(s)

Xiuwen Zheng zhengxwen@gmail.com

References

Zheng X, Levine D, Shen J, Gogarten SM, Laurie C, Weir BS. A High-performance Computing Toolset for Relatedness and Principal Component Analysis of SNP Data. Bioinformatics (2012); doi: 10.1093/bioinformatics/bts610

Examples

####################################################################
# Convert the PLINK BED file to the GDS file
#

# PLINK BED files
bed.fn <- system.file("extdata", "plinkhapmap.bed.gz", package="SNPRelate")
fam.fn <- system.file("extdata", "plinkhapmap.fam.gz", package="SNPRelate")
bim.fn <- system.file("extdata", "plinkhapmap.bim.gz", package="SNPRelate")

# convert
snpgdsBED2GDS(bed.fn, fam.fn, bim.fn, "HapMap.gds")


####################################################################
# Principal Component Analysis
#

# open
genofile <- snpgdsOpen("HapMap.gds")

RV <- snpgdsPCA(genofile)
plot(RV$eigenvect[,2], RV$eigenvect[,1], xlab="PC 2", ylab="PC 1",
    col=rgb(0,0,150, 50, maxColorValue=255), pch=19)

# close the file
snpgdsClose(genofile)


####################################################################
# Identity-By-Descent (IBD) Analysis
#

# open
genofile <- snpgdsOpen(snpgdsExampleFileName())

RV <- snpgdsIBDMoM(genofile)
flag <- lower.tri(RV$k0)
plot(RV$k0[flag], RV$k1[flag], xlab="k0", ylab="k1",
    col=rgb(0,0,150, 50, maxColorValue=255), pch=19)
abline(1, -1, col="red", lty=4)

# close the file
snpgdsClose(genofile)


####################################################################
# Identity-By-State (IBS) Analysis
#

# open
genofile <- snpgdsOpen(snpgdsExampleFileName())

RV <- snpgdsIBS(genofile)
m <- 1 - RV$ibs
colnames(m) <- rownames(m) <- RV$sample.id
GeneticDistance <- as.dist(m[1:45, 1:45])
HC <- hclust(GeneticDistance, "ave")
plot(HC)

# close the file
snpgdsClose(genofile)


####################################################################
# Linkage Disequilibrium (LD) Analysis
#

# open an example dataset (HapMap)
genofile <- snpgdsOpen(snpgdsExampleFileName())

snpset <- read.gdsn(index.gdsn(genofile, "snp.id"))[1:200]
L1 <- snpgdsLDMat(genofile, snp.id=snpset, method="composite", slide=-1)

# plot
image(abs(L1$LD), col=terrain.colors(64))

# close the file
snpgdsClose(genofile)

Results


R version 3.3.1 (2016-06-21) -- "Bug in Your Hair"
Copyright (C) 2016 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> library(SNPRelate)
Loading required package: gdsfmt
SNPRelate -- supported by Streaming SIMD Extensions 2 (SSE2)
> png(filename="/home/ddbj/snapshot/RGM3/R_BC/result/SNPRelate/SNPRelate-package.Rd_%03d_medium.png", width=480, height=480)
> ### Name: SNPRelate-package
> ### Title: Parallel Computing Toolset for Genome-Wide Association Studies
> ### Aliases: SNPRelate-package SNPRelate
> ### Keywords: GDS GWAS
> 
> ### ** Examples
> 
> ####################################################################
> # Convert the PLINK BED file to the GDS file
> #
> 
> # PLINK BED files
> bed.fn <- system.file("extdata", "plinkhapmap.bed.gz", package="SNPRelate")
> fam.fn <- system.file("extdata", "plinkhapmap.fam.gz", package="SNPRelate")
> bim.fn <- system.file("extdata", "plinkhapmap.bim.gz", package="SNPRelate")
> 
> # convert
> snpgdsBED2GDS(bed.fn, fam.fn, bim.fn, "HapMap.gds")
Start snpgdsBED2GDS ...
	BED file: "/home/ddbj/local/lib64/R/library/SNPRelate/extdata/plinkhapmap.bed.gz" in the SNP-major mode (Sample X SNP)
	FAM file: "/home/ddbj/local/lib64/R/library/SNPRelate/extdata/plinkhapmap.fam.gz", DONE.
	BIM file: "/home/ddbj/local/lib64/R/library/SNPRelate/extdata/plinkhapmap.bim.gz", DONE.
Wed Jul  6 05:34:24 2016 	store sample id, snp id, position, and chromosome.
	start writing: 60 samples, 5000 SNPs ...
 	Wed Jul  6 05:34:24 2016	0%
 	Wed Jul  6 05:34:25 2016	100%
Wed Jul  6 05:34:25 2016 	Done.
Optimize the access efficiency ...
Clean up the fragments of GDS file:
    open the file 'HapMap.gds' (104.6K)
    # of fragments: 38
    save to 'HapMap.gds.tmp'
    rename 'HapMap.gds.tmp' (104.4K, reduced: 240B)
    # of fragments: 18
> 
> 
> ####################################################################
> # Principal Component Analysis
> #
> 
> # open
> genofile <- snpgdsOpen("HapMap.gds")
> 
> RV <- snpgdsPCA(genofile)
Principal Component Analysis (PCA) on SNP genotypes:
Excluding 203 SNPs on non-autosomes
Excluding 28 SNPs (monomorphic: TRUE, < MAF: NaN, or > missing rate: NaN)
Working space: 60 samples, 4769 SNPs
	using 1 (CPU) core
PCA:	the sum of all selected genotypes (0, 1 and 2) = 124273
Wed Jul  6 05:34:25 2016    (internal increment: 8120)
 [>.................................................]  0%, ETC: NA     [==================================================] 100%, completed  
Wed Jul  6 05:34:25 2016    Begin (eigenvalues and eigenvectors)
Wed Jul  6 05:34:25 2016    Done.
> plot(RV$eigenvect[,2], RV$eigenvect[,1], xlab="PC 2", ylab="PC 1",
+     col=rgb(0,0,150, 50, maxColorValue=255), pch=19)
> 
> # close the file
> snpgdsClose(genofile)
> 
> 
> ####################################################################
> # Identity-By-Descent (IBD) Analysis
> #
> 
> # open
> genofile <- snpgdsOpen(snpgdsExampleFileName())
> 
> RV <- snpgdsIBDMoM(genofile)
IBD analysis (PLINK method of moment) on SNP genotypes:
Excluding 365 SNPs on non-autosomes
Excluding 1 SNP (monomorphic: TRUE, < MAF: NaN, or > missing rate: NaN)
Working space: 279 samples, 8722 SNPs
	using 1 (CPU) core
PLINK IBD:	the sum of all selected genotypes (0, 1 and 2) = 2446510
Wed Jul  6 05:34:29 2016    (internal increment: 55808)
 [>.................................................]  0%, ETC: NA     [==================================================] 100%, completed  
Wed Jul  6 05:34:29 2016    Done.
> flag <- lower.tri(RV$k0)
> plot(RV$k0[flag], RV$k1[flag], xlab="k0", ylab="k1",
+     col=rgb(0,0,150, 50, maxColorValue=255), pch=19)
> abline(1, -1, col="red", lty=4)
> 
> # close the file
> snpgdsClose(genofile)
> 
> 
> ####################################################################
> # Identity-By-State (IBS) Analysis
> #
> 
> # open
> genofile <- snpgdsOpen(snpgdsExampleFileName())
> 
> RV <- snpgdsIBS(genofile)
Identity-By-State (IBS) analysis on SNP genotypes:
Excluding 365 SNPs on non-autosomes
Excluding 1 SNP (monomorphic: TRUE, < MAF: NaN, or > missing rate: NaN)
Working space: 279 samples, 8722 SNPs
	using 1 (CPU) core
IBS:	the sum of all selected genotypes (0, 1 and 2) = 2446510
Wed Jul  6 05:34:30 2016    (internal increment: 55808)
 [>.................................................]  0%, ETC: NA     [==================================================] 100%, completed  
Wed Jul  6 05:34:30 2016    Done.
> m <- 1 - RV$ibs
> colnames(m) <- rownames(m) <- RV$sample.id
> GeneticDistance <- as.dist(m[1:45, 1:45])
> HC <- hclust(GeneticDistance, "ave")
> plot(HC)
> 
> # close the file
> snpgdsClose(genofile)
> 
> 
> ####################################################################
> # Linkage Disequilibrium (LD) Analysis
> #
> 
> # open an example dataset (HapMap)
> genofile <- snpgdsOpen(snpgdsExampleFileName())
> 
> snpset <- read.gdsn(index.gdsn(genofile, "snp.id"))[1:200]
> L1 <- snpgdsLDMat(genofile, snp.id=snpset, method="composite", slide=-1)
Linkage Disequilibrium (LD) analysis on SNP genotypes:
Working space: 279 samples, 200 SNPs
	Using 1 (CPU) core.
LD matrix:	the sum of all selected genotypes (0, 1 and 2) = 55417
> 
> # plot
> image(abs(L1$LD), col=terrain.colors(64))
> 
> # close the file
> snpgdsClose(genofile)
> 
> 
> 
> 
> 
> dev.off()
null device 
          1 
>