Last data update: 2014.03.03

R: Calculate PREBS values
calc_prebsR Documentation

Calculate PREBS values

Description

calc_prebs calculates PREBS values for given set of BAM files.

Usage

calc_prebs(bam_files, probe_mapping_file, cdf_name = NULL, cluster = NULL,
  output_eset = TRUE, paired_ended_reads = FALSE, ignore_strand = TRUE,
  sum.method = "rpa")

Arguments

bam_files

A vector containing .bam files.

probe_mapping_file

A file containing probe mappings in the genome.

cdf_name

A name of CDF package to use in RMA algorithm. If cdf_name=NULL, the package name is inferred from the name of probe_mapping_file ("HGU133Plus2_Hs_ENSG_mapping.txt" -> "hgu133plus2hsensgcdf")

cluster

A cluster object created using "makeCluster" function from "parellel" package. If cluster=NULL, no parallelization is used.

output_eset

If set to TRUE, the output of calc_prebs will be ExpressionSet object. Otherwise, the output will be a data frame.

paired_ended_reads

Set it to TRUE if your data contains paired-ended reads. Otherwise, the two read mates will be treated as independent units.

ignore_strand

If set to TRUE, then the strand is ignored while counting read overlaps with probe regions. If you use strand-specific RNA-seq protocol, set to FALSE, otherwise set it to TRUE.

sum.method

Microarray summarization method to be used. Can be either rpa or rma. The default mode is rpa.

Details

calc_prebs is the main function of prebs package that implements the whole pipeline. The function takes mapped reads in BAM format and probe sequence mappings as an input.

calc_prebs can run in two modes: rpa and rma. RMA is the classical microarray summarization algorithm developed by R. A. Irizarry et al. (2003), while RPA is a newer algorithm that was developed by L. Lahti et al. (2011). The default mode is rpa. NOTE: before prebs version 1.7.1 only RMA mode was available.

The output format depends on output_eset option. If output_eset=TRUE then calc_prebs returns ExpressionSet object (ExpressionSet object is defined in affy package). Otherwise, it returns a data frame containing PREBS values.

For running calc_prebs with custom CDF, the custom CDF package has to be downloaded and installed from Custom CDF website: http://brainarray.mbni.med.umich.edu/CustomCDF

For running calc_prebs with manufacturer's CDF, the manufacturer's CDF package can be installed from Bioconductor, for example: biocLite("GenomicRanges"); biocLite("hgu133plus2cdf")

For a detailed input specification, please refer to the prebs vignette.

Value

ExpressionSet object or a data frame containing PREBS values

Examples

if (require(prebsdata)) {
  # Get full paths to data files in code{prebsdata} package
  bam_file1 <- system.file(file.path("sample_bam_files", "input1.bam"), package="prebsdata")
  bam_file2 <- system.file(file.path("sample_bam_files", "input2.bam"), package="prebsdata")
  bam_files <- c(bam_file1, bam_file2)
  custom_cdf_mapping1 <- system.file(file.path("custom-cdf", "HGU133Plus2_Hs_ENSG_mapping.txt"),
                                     package="prebsdata")
  custom_cdf_mapping2 <- system.file(file.path("custom-cdf", "HGU133A2_Hs_ENSG_mapping.txt"),
                                     package="prebsdata")
  manufacturer_cdf_mapping <- system.file(file.path("manufacturer-cdf", "HGU133Plus2_mapping.txt"),
                                          package="prebsdata")
  if (interactive()) {
    # Run PREBS using custom CDF without parallelization ("rpa" mode)
    prebs_values <- calc_prebs(bam_files, custom_cdf_mapping1)
    head(exprs(prebs_values))

    # Run PREBS using custom CDF without parallelization ("rma" mode)
    prebs_values <- calc_prebs(bam_files, custom_cdf_mapping1, sum.method="rma")
    head(exprs(prebs_values))

    # Run PREBS using custom CDF with parallelization
    library(parallel)
    N_CORES = 2
    CLUSTER <- makeCluster(N_CORES)
    prebs_values <- calc_prebs(bam_files, custom_cdf_mapping1, cluster=CLUSTER)
    stopCluster(CLUSTER)

    # Run PREBS using another custom CDF
    prebs_values <- calc_prebs(bam_files, custom_cdf_mapping2)

    # Run PREBS and return data frame instead of ExpressionSet object
    prebs_values <- calc_prebs(bam_files, custom_cdf_mapping1, output_eset=FALSE)
    head(prebs_values)
  }

  # Run PREBS using Manufacturer's CDF (outputs probe set expressions)
  prebs_values <- calc_prebs(bam_files, manufacturer_cdf_mapping)
  head(exprs(prebs_values))

  # Same as above, but state CDF package name explicitly
  prebs_values <- calc_prebs(bam_files, manufacturer_cdf_mapping, cdf_name="hgu133plus2cdf")
}

Results


R version 3.3.1 (2016-06-21) -- "Bug in Your Hair"
Copyright (C) 2016 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> library(prebs)
Loading required package: GenomicAlignments
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: 'BiocGenerics'

The following objects are masked from 'package:parallel':

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from 'package:stats':

    IQR, mad, xtabs

The following objects are masked from 'package:base':

    Filter, Find, Map, Position, Reduce, anyDuplicated, append,
    as.data.frame, cbind, colnames, do.call, duplicated, eval, evalq,
    get, grep, grepl, intersect, is.unsorted, lapply, lengths, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int, rank,
    rbind, rownames, sapply, setdiff, sort, table, tapply, union,
    unique, unsplit

Loading required package: S4Vectors
Loading required package: stats4

Attaching package: 'S4Vectors'

The following objects are masked from 'package:base':

    colMeans, colSums, expand.grid, rowMeans, rowSums

Loading required package: IRanges
Loading required package: GenomeInfoDb
Loading required package: GenomicRanges
Loading required package: SummarizedExperiment
Loading required package: Biobase
Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.

Loading required package: Biostrings
Loading required package: XVector
Loading required package: Rsamtools
Loading required package: affy
Loading required package: RPA
Loading required package: phyloseq

Attaching package: 'phyloseq'

The following object is masked from 'package:affy':

    sampleNames

The following object is masked from 'package:SummarizedExperiment':

    distance

The following object is masked from 'package:Biobase':

    sampleNames

The following object is masked from 'package:GenomicRanges':

    distance

The following object is masked from 'package:IRanges':

    distance


RPA Copyright (C) 2008-2016 Leo Lahti.
This program comes with ABSOLUTELY NO WARRANTY.
This is free software, and you are welcome to redistribute it under the FreeBSD open source license.

> png(filename="/home/ddbj/snapshot/RGM3/R_BC/result/prebs/calc_prebs.Rd_%03d_medium.png", width=480, height=480)
> ### Name: calc_prebs
> ### Title: Calculate PREBS values
> ### Aliases: calc_prebs
> 
> ### ** Examples
> 
> if (require(prebsdata)) {
+   # Get full paths to data files in code{prebsdata} package
+   bam_file1 <- system.file(file.path("sample_bam_files", "input1.bam"), package="prebsdata")
+   bam_file2 <- system.file(file.path("sample_bam_files", "input2.bam"), package="prebsdata")
+   bam_files <- c(bam_file1, bam_file2)
+   custom_cdf_mapping1 <- system.file(file.path("custom-cdf", "HGU133Plus2_Hs_ENSG_mapping.txt"),
+                                      package="prebsdata")
+   custom_cdf_mapping2 <- system.file(file.path("custom-cdf", "HGU133A2_Hs_ENSG_mapping.txt"),
+                                      package="prebsdata")
+   manufacturer_cdf_mapping <- system.file(file.path("manufacturer-cdf", "HGU133Plus2_mapping.txt"),
+                                           package="prebsdata")
+ #  if (interactive()) {
+     # Run PREBS using custom CDF without parallelization ("rpa" mode)
+     prebs_values <- calc_prebs(bam_files, custom_cdf_mapping1)
+     head(exprs(prebs_values))
+ 
+     # Run PREBS using custom CDF without parallelization ("rma" mode)
+     prebs_values <- calc_prebs(bam_files, custom_cdf_mapping1, sum.method="rma")
+     head(exprs(prebs_values))
+ 
+     # Run PREBS using custom CDF with parallelization
+     library(parallel)
+     N_CORES = 2
+     CLUSTER <- makeCluster(N_CORES)
+     prebs_values <- calc_prebs(bam_files, custom_cdf_mapping1, cluster=CLUSTER)
+     stopCluster(CLUSTER)
+ 
+     # Run PREBS using another custom CDF
+     prebs_values <- calc_prebs(bam_files, custom_cdf_mapping2)
+ 
+     # Run PREBS and return data frame instead of ExpressionSet object
+     prebs_values <- calc_prebs(bam_files, custom_cdf_mapping1, output_eset=FALSE)
+     head(prebs_values)
+ #  }
+ 
+   # Run PREBS using Manufacturer's CDF (outputs probe set expressions)
+   prebs_values <- calc_prebs(bam_files, manufacturer_cdf_mapping)
+   head(exprs(prebs_values))
+ 
+   # Same as above, but state CDF package name explicitly
+   prebs_values <- calc_prebs(bam_files, manufacturer_cdf_mapping, cdf_name="hgu133plus2cdf")
+ }
Loading required package: prebsdata
Inferred name for CDF package: HGU133Plus2_Hs_ENSG_mapping.txt -> hgu133plus2hsensgcdf

Error in check_input(bam_files, probe_mapping_file, cdf_name) : 
  CDF package `hgu133plus2hsensgcdf` is not installed. Please, install the package before running `calc_prebs`
Calls: calc_prebs -> check_input
Execution halted