Last data update: 2014.03.03
R: translate the GFF3 from a ciseqByCluster/processgff output...
translate the GFF3 from a ciseqByCluster/processgff output into
a serialized data.table instance, compute genome-wide plug-in FDR,
and update the GFF3 with this FDR
Description
translate the GFF3 from a ciseqByCluster/processgff output into
a serialized data.table instance, compute genome-wide plug-in FDR,
and update the GFF3 with this FDR
Usage
cgff2dt(gff3, tiling, addHitTest = TRUE, addcc878 = TRUE)
Arguments
gff3
character string naming a tabix-indexed, bgzipped output of
gffprocess
tiling
output of simpleTiling
addHitTest
logical, telling whether to add a column on coincidence of
SNP with the gwastagger
ranges
addcc878
logical, telling whether to add a column on coincidence of
SNP with the hmm878
ranges, using the inferred
chromatin state as factor level
Note
assumes unix utilites zcat, paste and bgzip are available
Examples
##---- Should be DIRECTLY executable !! ----
##-- ==> Define data, use random,
##-- or do help(data=index) for the standard data sets.
## The function is currently defined as
function (gff3, tiling)
{
require(foreach)
require(Rsamtools)
stopifnot(is(tiling, "GRanges"))
basen = gsub(".gff3.gz", "", gff3)
th = headerTabix(gff3)
orderedChr = th$seqnames
lgr = foreach(i = 1:length(tiling)) %dopar% {
gc()
cat(i)
lk = try(import.gff3(gff3, which = tiling[i]))
if (inherits(lk, "try-error"))
lk = NULL
if (!is.null(lk))
lk = as.data.table(as.data.frame(lk))
lk
}
bad = sapply(lgr, is.null)
if (any(bad))
lgr = lgr[-which(bad)]
ans = do.call(rbind, lgr)
ans$snplocs = as.numeric(ans$snplocs)
ans$ests = as.numeric(ans$ests)
ans$se = as.numeric(ans$se)
ans$oldfdr = as.numeric(ans$fdr)
ans$MAF = as.numeric(ans$MAF)
ans$dist.mid = as.numeric(ans$dist.mid)
nperm = length(grep("permS", names(ans)))
pnames = paste("permScore_", 1:nperm, sep = "")
for (i in 1:nperm) ans[[pnames[i]]] = as.numeric(ans[[pnames[i]]])
ans$mindist = as.numeric(ans$mindist)
print(date())
ans$fdr = pifdr(ans$score, c(ans$permScore_1, ans$permScore_2,
ans$permScore_3))
print(date())
obn = paste0(basen, "_dt")
assign(obn, ans)
save(list = obn, file = paste0(obn, ".rda"))
gwfdr = ans$fdr
gwch = ans$seqnames
gwsnp = ans$snp
gwprobeid = ans$probeid
sgwfdr = split(gwfdr, gwch)
sgwfdr = unlist(sgwfdr[orderedChr])
nn = tempfile()
fdrt = file(nn, "w")
writeLines(c(c(" ", " ", " "), paste("; gwfdr=", round(sgwfdr,
6), sep = "")), con = fdrt)
close(fdrt)
chkb = system(paste0("zcat ", gff3, " | paste -d '' - ",
nn, " | bgzip > ", gsub(".gff3", "wmlf.gff3", gff3)))
if (!(chkb == 0))
warning(paste("final system zcat-paste-bgzip returned non-zero: chkb=",
chkb))
invisible(chkb)
}
Results
R version 3.3.1 (2016-06-21) -- "Bug in Your Hair"
Copyright (C) 2016 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)
R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.
R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.
Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.
> library(GGtools)
Loading required package: GGBase
Loading required package: snpStats
Loading required package: survival
Loading required package: Matrix
Loading required package: data.table
Loading required package: parallel
Loading required package: Homo.sapiens
Loading required package: AnnotationDbi
Loading required package: stats4
Loading required package: BiocGenerics
Attaching package: 'BiocGenerics'
The following objects are masked from 'package:parallel':
clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
clusterExport, clusterMap, parApply, parCapply, parLapply,
parLapplyLB, parRapply, parSapply, parSapplyLB
The following objects are masked from 'package:stats':
IQR, mad, xtabs
The following objects are masked from 'package:base':
Filter, Find, Map, Position, Reduce, anyDuplicated, append,
as.data.frame, cbind, colnames, do.call, duplicated, eval, evalq,
get, grep, grepl, intersect, is.unsorted, lapply, lengths, mapply,
match, mget, order, paste, pmax, pmax.int, pmin, pmin.int, rank,
rbind, rownames, sapply, setdiff, sort, table, tapply, union,
unique, unsplit
Loading required package: Biobase
Welcome to Bioconductor
Vignettes contain introductory material; view with
'browseVignettes()'. To cite Bioconductor, see
'citation("Biobase")', and for packages 'citation("pkgname")'.
Loading required package: IRanges
Loading required package: S4Vectors
Attaching package: 'S4Vectors'
The following objects are masked from 'package:Matrix':
colMeans, colSums, expand, rowMeans, rowSums
The following objects are masked from 'package:base':
colMeans, colSums, expand.grid, rowMeans, rowSums
Attaching package: 'IRanges'
The following object is masked from 'package:data.table':
shift
Loading required package: OrganismDbi
Loading required package: GenomicFeatures
Loading required package: GenomeInfoDb
Loading required package: GenomicRanges
Loading required package: GO.db
Loading required package: org.Hs.eg.db
Loading required package: TxDb.Hsapiens.UCSC.hg19.knownGene
Attaching package: 'GGtools'
The following object is masked from 'package:stats':
getCall
> png(filename="/home/ddbj/snapshot/RGM3/R_BC/result/GGtools/cgff2dt.Rd_%03d_medium.png", width=480, height=480)
> ### Name: cgff2dt
> ### Title: translate the GFF3 from a ciseqByCluster/processgff output into
> ### a serialized data.table instance, compute genome-wide plug-in FDR,
> ### and update the GFF3 with this FDR
> ### Aliases: cgff2dt
> ### Keywords: models
>
> ### ** Examples
>
> ##---- Should be DIRECTLY executable !! ----
> ##-- ==> Define data, use random,
> ##-- or do help(data=index) for the standard data sets.
>
> ## The function is currently defined as
> function (gff3, tiling)
+ {
+ require(foreach)
+ require(Rsamtools)
+ stopifnot(is(tiling, "GRanges"))
+ basen = gsub(".gff3.gz", "", gff3)
+ th = headerTabix(gff3)
+ orderedChr = th$seqnames
+ lgr = foreach(i = 1:length(tiling)) %dopar% {
+ gc()
+ cat(i)
+ lk = try(import.gff3(gff3, which = tiling[i]))
+ if (inherits(lk, "try-error"))
+ lk = NULL
+ if (!is.null(lk))
+ lk = as.data.table(as.data.frame(lk))
+ lk
+ }
+ bad = sapply(lgr, is.null)
+ if (any(bad))
+ lgr = lgr[-which(bad)]
+ ans = do.call(rbind, lgr)
+ ans$snplocs = as.numeric(ans$snplocs)
+ ans$ests = as.numeric(ans$ests)
+ ans$se = as.numeric(ans$se)
+ ans$oldfdr = as.numeric(ans$fdr)
+ ans$MAF = as.numeric(ans$MAF)
+ ans$dist.mid = as.numeric(ans$dist.mid)
+ nperm = length(grep("permS", names(ans)))
+ pnames = paste("permScore_", 1:nperm, sep = "")
+ for (i in 1:nperm) ans[[pnames[i]]] = as.numeric(ans[[pnames[i]]])
+ ans$mindist = as.numeric(ans$mindist)
+ print(date())
+ ans$fdr = pifdr(ans$score, c(ans$permScore_1, ans$permScore_2,
+ ans$permScore_3))
+ print(date())
+ obn = paste0(basen, "_dt")
+ assign(obn, ans)
+ save(list = obn, file = paste0(obn, ".rda"))
+ gwfdr = ans$fdr
+ gwch = ans$seqnames
+ gwsnp = ans$snp
+ gwprobeid = ans$probeid
+ sgwfdr = split(gwfdr, gwch)
+ sgwfdr = unlist(sgwfdr[orderedChr])
+ nn = tempfile()
+ fdrt = file(nn, "w")
+ writeLines(c(c(" ", " ", " "), paste("; gwfdr=", round(sgwfdr,
+ 6), sep = "")), con = fdrt)
+ close(fdrt)
+ chkb = system(paste0("zcat ", gff3, " | paste -d '' - ",
+ nn, " | bgzip > ", gsub(".gff3", "wmlf.gff3", gff3)))
+ if (!(chkb == 0))
+ warning(paste("final system zcat-paste-bgzip returned non-zero: chkb=",
+ chkb))
+ invisible(chkb)
+ }
function (gff3, tiling)
{
require(foreach)
require(Rsamtools)
stopifnot(is(tiling, "GRanges"))
basen = gsub(".gff3.gz", "", gff3)
th = headerTabix(gff3)
orderedChr = th$seqnames
lgr = foreach(i = 1:length(tiling)) %dopar% {
gc()
cat(i)
lk = try(import.gff3(gff3, which = tiling[i]))
if (inherits(lk, "try-error"))
lk = NULL
if (!is.null(lk))
lk = as.data.table(as.data.frame(lk))
lk
}
bad = sapply(lgr, is.null)
if (any(bad))
lgr = lgr[-which(bad)]
ans = do.call(rbind, lgr)
ans$snplocs = as.numeric(ans$snplocs)
ans$ests = as.numeric(ans$ests)
ans$se = as.numeric(ans$se)
ans$oldfdr = as.numeric(ans$fdr)
ans$MAF = as.numeric(ans$MAF)
ans$dist.mid = as.numeric(ans$dist.mid)
nperm = length(grep("permS", names(ans)))
pnames = paste("permScore_", 1:nperm, sep = "")
for (i in 1:nperm) ans[[pnames[i]]] = as.numeric(ans[[pnames[i]]])
ans$mindist = as.numeric(ans$mindist)
print(date())
ans$fdr = pifdr(ans$score, c(ans$permScore_1, ans$permScore_2,
ans$permScore_3))
print(date())
obn = paste0(basen, "_dt")
assign(obn, ans)
save(list = obn, file = paste0(obn, ".rda"))
gwfdr = ans$fdr
gwch = ans$seqnames
gwsnp = ans$snp
gwprobeid = ans$probeid
sgwfdr = split(gwfdr, gwch)
sgwfdr = unlist(sgwfdr[orderedChr])
nn = tempfile()
fdrt = file(nn, "w")
writeLines(c(c(" ", " ", " "), paste("; gwfdr=", round(sgwfdr,
6), sep = "")), con = fdrt)
close(fdrt)
chkb = system(paste0("zcat ", gff3, " | paste -d '' - ",
nn, " | bgzip > ", gsub(".gff3", "wmlf.gff3", gff3)))
if (!(chkb == 0))
warning(paste("final system zcat-paste-bgzip returned non-zero: chkb=",
chkb))
invisible(chkb)
}
>
>
>
>
>
> dev.off()
null device
1
>