Last data update: 2014.03.03

R: Determine clusters of individuals
snpgdsCutTreeR Documentation

Determine clusters of individuals

Description

To determine sub groups of individuals using a specified dendrogram from hierarchical cluster analysis

Usage

snpgdsCutTree(hc, z.threshold=15, outlier.n=5, n.perm = 5000, samp.group=NULL,
    col.outlier="red", col.list=NULL, pch.outlier=4, pch.list=NULL,
    label.H=FALSE, label.Z=TRUE, verbose=TRUE)

Arguments

hc

an object of snpgdsHCluster

z.threshold

the threshold of Z score to determine whether split the node or not

outlier.n

the cluster with size less than or equal to outlier.n is considered as outliers

n.perm

the times for permutation

samp.group

if NULL, determine groups by Z score; if a vector of factor, assign each individual in dendrogram with respect to samp.group

col.outlier

the color of outlier

col.list

the list of colors for different clusters

pch.outlier

plotting 'character' for outliers

pch.list

plotting 'character' for different clusters

label.H

if TRUE, plotting heights in a dendrogram

label.Z

if TRUE, plotting Z scores in a dendrogram

verbose

if TRUE, show information

Details

The details will be described in future.

Value

Return a list:

sample.id

the sample ids used in the analysis

z.threshold

the threshold of Z score to determine whether split the node or not

outlier.n

the cluster with size less than or equal to outlier.n is considered as outliers

samp.order

the order of samples in the dendrogram

samp.group

a vector of factor, indicating the group of each individual

dmat

a matrix of pairwise group dissimilarity

dendrogram

the dendrogram of individuals

merge

a data.frame of (z, n1, n2) describing each combination: z, the Z score; n1, the size of the first cluster; n2, the size of the second cluster

clust.count

the counts for clusters

Author(s)

Xiuwen Zheng

See Also

snpgdsHCluster, snpgdsDrawTree, snpgdsIBS, snpgdsDiss

Examples

# open an example dataset (HapMap)
genofile <- snpgdsOpen(snpgdsExampleFileName())

pop.group <- as.factor(read.gdsn(index.gdsn(
    genofile, "sample.annot/pop.group")))
pop.level <- levels(pop.group)

diss <- snpgdsDiss(genofile)
hc <- snpgdsHCluster(diss)

# close the genotype file
snpgdsClose(genofile)



###################################################################
# cluster individuals
#

set.seed(100)
rv <- snpgdsCutTree(hc, label.H=TRUE, label.Z=TRUE)

# the distribution of Z scores
snpgdsDrawTree(rv, type="z-score", main="HapMap Phase II")

# draw dendrogram
snpgdsDrawTree(rv, main="HapMap Phase II",
    edgePar=list(col=rgb(0.5,0.5,0.5, 0.75), t.col="black"))


###################################################################
# or cluster individuals by ethnic information
#

rv2 <- snpgdsCutTree(hc, samp.group=pop.group)

# cluster individuals by Z score, specifying 'clust.count'
snpgdsDrawTree(rv2, rv$clust.count, main="HapMap Phase II",
    edgePar = list(col=rgb(0.5,0.5,0.5, 0.75), t.col="black"),
    labels = c("YRI", "CHB/JPT", "CEU"), y.label=0.1)
legend("bottomleft", legend=levels(pop.group), col=1:nlevels(pop.group),
    pch=19, ncol=4, bg="white")



###################################################################
# zoom in ...
#

snpgdsDrawTree(rv2, rv$clust.count, dend.idx = c(1),
    main="HapMap Phase II -- YRI",
    edgePar=list(col=rgb(0.5,0.5,0.5, 0.75), t.col="black"),
    y.label.kinship=TRUE)

snpgdsDrawTree(rv2, rv$clust.count, dend.idx = c(2,2),
    main="HapMap Phase II -- CEU",
    edgePar=list(col=rgb(0.5,0.5,0.5, 0.75), t.col="black"),
    y.label.kinship=TRUE)

snpgdsDrawTree(rv2, rv$clust.count, dend.idx = c(2,1),
    main="HapMap Phase II -- CHB/JPT",
    edgePar=list(col=rgb(0.5,0.5,0.5, 0.75), t.col="black"),
    y.label.kinship=TRUE)

Results


R version 3.3.1 (2016-06-21) -- "Bug in Your Hair"
Copyright (C) 2016 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> library(SNPRelate)
Loading required package: gdsfmt
SNPRelate -- supported by Streaming SIMD Extensions 2 (SSE2)
> png(filename="/home/ddbj/snapshot/RGM3/R_BC/result/SNPRelate/snpgdsCutTree.Rd_%03d_medium.png", width=480, height=480)
> ### Name: snpgdsCutTree
> ### Title: Determine clusters of individuals
> ### Aliases: snpgdsCutTree
> ### Keywords: GDS GWAS
> 
> ### ** Examples
> 
> # open an example dataset (HapMap)
> genofile <- snpgdsOpen(snpgdsExampleFileName())
> 
> pop.group <- as.factor(read.gdsn(index.gdsn(
+     genofile, "sample.annot/pop.group")))
> pop.level <- levels(pop.group)
> 
> diss <- snpgdsDiss(genofile)
Individual dissimilarity analysis on SNP genotypes:
Excluding 365 SNPs on non-autosomes
Excluding 1 SNP (monomorphic: TRUE, < MAF: NaN, or > missing rate: NaN)
Working space: 279 samples, 8722 SNPs
	using 1 (CPU) core
Dissimilarity:	the sum of all selected genotypes (0, 1 and 2) = 2446510
Dissimilarity:	Wed Jul  6 05:34:32 2016	0%
Dissimilarity:	Wed Jul  6 05:34:33 2016	100%
> hc <- snpgdsHCluster(diss)
> 
> # close the genotype file
> snpgdsClose(genofile)
> 
> 
> 
> ###################################################################
> # cluster individuals
> #
> 
> set.seed(100)
> rv <- snpgdsCutTree(hc, label.H=TRUE, label.Z=TRUE)
Determine groups by permutation (Z threshold: 15, outlier threshold: 5):
Create 3 groups.
> 
> # the distribution of Z scores
> snpgdsDrawTree(rv, type="z-score", main="HapMap Phase II")
> 
> # draw dendrogram
> snpgdsDrawTree(rv, main="HapMap Phase II",
+     edgePar=list(col=rgb(0.5,0.5,0.5, 0.75), t.col="black"))
> 
> 
> ###################################################################
> # or cluster individuals by ethnic information
> #
> 
> rv2 <- snpgdsCutTree(hc, samp.group=pop.group)
Create 4 groups.
> 
> # cluster individuals by Z score, specifying 'clust.count'
> snpgdsDrawTree(rv2, rv$clust.count, main="HapMap Phase II",
+     edgePar = list(col=rgb(0.5,0.5,0.5, 0.75), t.col="black"),
+     labels = c("YRI", "CHB/JPT", "CEU"), y.label=0.1)
> legend("bottomleft", legend=levels(pop.group), col=1:nlevels(pop.group),
+     pch=19, ncol=4, bg="white")
> 
> 
> 
> ###################################################################
> # zoom in ...
> #
> 
> snpgdsDrawTree(rv2, rv$clust.count, dend.idx = c(1),
+     main="HapMap Phase II -- YRI",
+     edgePar=list(col=rgb(0.5,0.5,0.5, 0.75), t.col="black"),
+     y.label.kinship=TRUE)
> 
> snpgdsDrawTree(rv2, rv$clust.count, dend.idx = c(2,2),
+     main="HapMap Phase II -- CEU",
+     edgePar=list(col=rgb(0.5,0.5,0.5, 0.75), t.col="black"),
+     y.label.kinship=TRUE)
> 
> snpgdsDrawTree(rv2, rv$clust.count, dend.idx = c(2,1),
+     main="HapMap Phase II -- CHB/JPT",
+     edgePar=list(col=rgb(0.5,0.5,0.5, 0.75), t.col="black"),
+     y.label.kinship=TRUE)
> 
> 
> 
> 
> 
> dev.off()
null device 
          1 
>