From ae5a59efee3b414f66416977afffab44e3248237 Mon Sep 17 00:00:00 2001 From: Roleren Date: Mon, 14 Sep 2020 11:58:13 +0200 Subject: [PATCH 1/2] updated coverageByTranscript with weights --- .gitignore | 3 + DESCRIPTION | 2 +- R/coverageByTranscript.R | 107 ++++++++++++++++++++++++++++++++++++ man/coverageByTranscript.Rd | 9 ++- 4 files changed, 119 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 7b73294a..d8e9aaf8 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,4 @@ /R/.Rhistory +.Rproj.user +GenomicFeatures.Rproj +.Rbuildignore diff --git a/DESCRIPTION b/DESCRIPTION index e4c11184..87770baa 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: GenomicFeatures Title: Conveniently import and query gene models -Version: 1.41.3 +Version: 1.41.4 Encoding: UTF-8 Author: M. Carlson, H. Pagès, P. Aboyoun, S. Falcon, M. Morgan, D. Sarkar, M. Lawrence, V. Obenchain diff --git a/R/coverageByTranscript.R b/R/coverageByTranscript.R index 945c4234..44a201f7 100644 --- a/R/coverageByTranscript.R +++ b/R/coverageByTranscript.R @@ -127,6 +127,113 @@ coverageByTranscript <- function(x, transcripts, ignore.strand=FALSE) mcols(ans) <- mcols(transcripts) ans } +### Computing coverage by transcript (or CDS) of a set of ranges is an +### operation that feels a lot like extracting transcript sequences from a +### genome. Defined as an ordinary function for now. +coverageByTranscript <- function (x, transcripts, ignore.strand = FALSE, + weight = 1L) { + if (!is(transcripts, "GRangesList")) { + transcripts <- try(exonsBy(transcripts, by = "tx", use.names = TRUE), + silent = TRUE) + if (is(transcripts, "try-error")) + stop(wmsg("failed to extract the exon ranges ", + "from 'transcripts' with ", "exonsBy(transcripts, by=\"tx\", use.names=TRUE)")) + } + if (!isTRUEorFALSE(ignore.strand)) + stop(wmsg("'ignore.strand' must be TRUE or FALSE")) + ## STEP 1 - Infer seqlengths that are missing from 'x' and 'transcripts'. + + ## This will guarantee that subsetting the named RleList object + ## representing the read coverage by the GRanges object representing + ## the set of unique exons won't fail due to out-of-bounds ranges in + ## the subscript. See STEP 3 below. + seqinfo(x) <- .merge_seqinfo_and_infer_missing_seqlengths(x, transcripts) + + ## STEP 2 - Compute unique exons ('uex'). + + ex <- unlist(transcripts, use.names=FALSE) + ## We could simply do 'uex <- unique(ex)' here but we're going to need + ## 'sm' and 'is_unique' later to compute the "reverse index" so we compute + ## them now and use them to extract the unique exons. That way we hash + ## 'ex' only once (the expensive operation). + sm <- selfmatch(ex) # uses a hash table internally + is_unique <- sm == seq_along(sm) + uex2ex <- which(is_unique) # index of unique exons + uex <- ex[uex2ex] # unique exons + + ## STEP 3 - Compute coverage for each unique exon ('uex_cvg'). + + #There doesn't seem to be much benefit in doing this. + #x <- subsetByOverlaps(x, transcripts, ignore.strand=TRUE) + + ## Fix GAlignments not allowing mcol weight, remove when they fix it + ## in GAlignments definition of coverage. + ## Because we've inferred the seqlengths that are missing from 'x' + ## and 'transcripts' (see STEP 1 above), 'uex' should never contain + ## out-of-bounds ranges i.e. the list elements in 'cvg' should always + ## be Rle objects that are long enough with respect to the ranges + ## in 'uex'. + if ((is(x, "GAlignments") | is(x, "GAlignmentPairs")) + & is.character(weight)) { + if (!(weight %in% colnames(mcols(x)))) + stop("weight is character and not mcol of x,", + " check spelling of weight.") + weight <- mcols(x)[, weight] + x <- grglist(x) # convert to grl + weight = weight[groupings(x)] # repeat weight per group + } + + if (ignore.strand) { + cvg <- coverage(x, weight = weight) + uex_cvg <- cvg[uex] + } + else { + pluss <- BiocGenerics::`%in%`(strand(x), c("+", "*")) + minus <- BiocGenerics::`%in%`(strand(x), c("-", "*")) + x1 <- x[pluss] + x2 <- x[minus] + if (length(weight) > 1) { + # Add unlist in case of GAlignments + cvg1 <- coverage(x1, weight = weight[as.logical(unlist(pluss))]) + cvg2 <- coverage(x2, weight = weight[as.logical(unlist(minus))]) + } else { + cvg1 <- coverage(x1, weight = weight) + cvg2 <- coverage(x2, weight = weight) + } + + is_plus_ex <- strand(uex) == "+" + is_minus_ex <- strand(uex) == "-" + if (!identical(is_plus_ex, !is_minus_ex)) + stop(wmsg("'transcripts' has exons on the * strand. ", + "This is not supported at the moment.")) + uex_cvg <- cvg1[uex] + uex_cvg[is_minus_ex] <- cvg2[uex[is_minus_ex]] + } + ## STEP 4 - Flip coverage for exons on minus strand. + + ## It feels like this is not as fast as it could be (the bottleneck being + ## subsetting an Rle object which needs to be revisited at some point). + uex_cvg <- revElements(uex_cvg, strand(uex) == "-") + + ## STEP 5 - Compute coverage by original exon ('ex_cvg'). + + ex2uex <- (seq_along(sm) - cumsum(!is_unique))[sm] # reverse index + #stopifnot(identical(ex2uex[uex2ex], seq_along(uex2ex))) # sanity + #stopifnot(identical(ex2uex[sm], ex2uex)) # sanity + #stopifnot(all(uex[ex2uex] == ex)) # sanity + + ex_cvg <- uex_cvg[ex2uex] # parallel go 'ex' + + ## STEP 6 - Compute coverage of each transcript by concatenating coverage of + ## its exons. + + ans <- IRanges:::regroupBySupergroup(ex_cvg, transcripts) + + ## STEP 7 - Propagate 'mcols(transcripts)'. + + mcols(ans) <- mcols(transcripts) + ans +} ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/man/coverageByTranscript.Rd b/man/coverageByTranscript.Rd index d2581a3d..ff99bec1 100644 --- a/man/coverageByTranscript.Rd +++ b/man/coverageByTranscript.Rd @@ -14,7 +14,7 @@ } \usage{ -coverageByTranscript(x, transcripts, ignore.strand=FALSE) +coverageByTranscript(x, transcripts, ignore.strand=FALSE, weight = 1L) pcoverageByTranscript(x, transcripts, ignore.strand=FALSE, ...) } @@ -65,6 +65,13 @@ pcoverageByTranscript(x, transcripts, ignore.strand=FALSE, ...) the range to contribute coverage to the exon. If TRUE then the strand is ignored. } + \item{weight}{ + weight assigns a weight to each range in x. + +If x is an IntegerRanges or Views object: each of these arguments must be an integer or numeric vector parallel to x (will get recycled if necessary). Alternatively, each of these arguments can also be specified as a single string naming a metadata column in x (i.e. a column in mcols(x)) to be used as the shift (or weight) vector. Note that when x is an IPos object, each of these arguments can only be a single number. + +If x is an IntegerRangesList object: each of these arguments must be a numeric vector or list-like object of the same length as x (will get recycled if necessary). If it's a numeric vector, it's first turned into a list with as.list. After recycling, each list element shift[[i]] (or weight[[i]]) must be an integer or numeric vector parallel to x[[i]] (will get recycled if necessary). + } \item{...}{ Additional arguments passed to the internal call to \code{\link[GenomicRanges]{grglist}()}. From e3d9b3aa42d192e533531a76c458f80eccf66bf9 Mon Sep 17 00:00:00 2001 From: Roleren Date: Mon, 14 Sep 2020 12:00:34 +0200 Subject: [PATCH 2/2] commented out old version --- R/coverageByTranscript.R | 184 ++++++++++++++++++++------------------- 1 file changed, 93 insertions(+), 91 deletions(-) diff --git a/R/coverageByTranscript.R b/R/coverageByTranscript.R index 44a201f7..101783c9 100644 --- a/R/coverageByTranscript.R +++ b/R/coverageByTranscript.R @@ -35,98 +35,100 @@ ans } -### Computing coverage by transcript (or CDS) of a set of ranges is an -### operation that feels a lot like extracting transcript sequences from a -### genome. Defined as an ordinary function for now. -coverageByTranscript <- function(x, transcripts, ignore.strand=FALSE) -{ - if (!is(transcripts, "GRangesList")) { - transcripts <- try(exonsBy(transcripts, by="tx", use.names=TRUE), - silent=TRUE) - if (is(transcripts, "try-error")) - stop(wmsg("failed to extract the exon ranges ", - "from 'transcripts' with ", - "exonsBy(transcripts, by=\"tx\", use.names=TRUE)")) - } - if (!isTRUEorFALSE(ignore.strand)) - stop(wmsg("'ignore.strand' must be TRUE or FALSE")) - - ## STEP 1 - Infer seqlengths that are missing from 'x' and 'transcripts'. - - ## This will guarantee that subsetting the named RleList object - ## representing the read coverage by the GRanges object representing - ## the set of unique exons won't fail due to out-of-bounds ranges in - ## the subscript. See STEP 3 below. - seqinfo(x) <- .merge_seqinfo_and_infer_missing_seqlengths(x, transcripts) - - ## STEP 2 - Compute unique exons ('uex'). - - ex <- unlist(transcripts, use.names=FALSE) - ## We could simply do 'uex <- unique(ex)' here but we're going to need - ## 'sm' and 'is_unique' later to compute the "reverse index" so we compute - ## them now and use them to extract the unique exons. That way we hash - ## 'ex' only once (the expensive operation). - sm <- selfmatch(ex) # uses a hash table internally - is_unique <- sm == seq_along(sm) - uex2ex <- which(is_unique) # index of unique exons - uex <- ex[uex2ex] # unique exons - - ## STEP 3 - Compute coverage for each unique exon ('uex_cvg'). +### OLD VERSION +# ### Computing coverage by transcript (or CDS) of a set of ranges is an +# ### operation that feels a lot like extracting transcript sequences from a +# ### genome. Defined as an ordinary function for now. +# coverageByTranscript <- function(x, transcripts, ignore.strand=FALSE) +# { +# if (!is(transcripts, "GRangesList")) { +# transcripts <- try(exonsBy(transcripts, by="tx", use.names=TRUE), +# silent=TRUE) +# if (is(transcripts, "try-error")) +# stop(wmsg("failed to extract the exon ranges ", +# "from 'transcripts' with ", +# "exonsBy(transcripts, by=\"tx\", use.names=TRUE)")) +# } +# if (!isTRUEorFALSE(ignore.strand)) +# stop(wmsg("'ignore.strand' must be TRUE or FALSE")) +# +# ## STEP 1 - Infer seqlengths that are missing from 'x' and 'transcripts'. +# +# ## This will guarantee that subsetting the named RleList object +# ## representing the read coverage by the GRanges object representing +# ## the set of unique exons won't fail due to out-of-bounds ranges in +# ## the subscript. See STEP 3 below. +# seqinfo(x) <- .merge_seqinfo_and_infer_missing_seqlengths(x, transcripts) +# +# ## STEP 2 - Compute unique exons ('uex'). +# +# ex <- unlist(transcripts, use.names=FALSE) +# ## We could simply do 'uex <- unique(ex)' here but we're going to need +# ## 'sm' and 'is_unique' later to compute the "reverse index" so we compute +# ## them now and use them to extract the unique exons. That way we hash +# ## 'ex' only once (the expensive operation). +# sm <- selfmatch(ex) # uses a hash table internally +# is_unique <- sm == seq_along(sm) +# uex2ex <- which(is_unique) # index of unique exons +# uex <- ex[uex2ex] # unique exons +# +# ## STEP 3 - Compute coverage for each unique exon ('uex_cvg'). +# +# #There doesn't seem to be much benefit in doing this. +# #x <- subsetByOverlaps(x, transcripts, ignore.strand=TRUE) +# if (ignore.strand) { +# cvg <- coverage(x) +# ## Because we've inferred the seqlengths that are missing from 'x' +# ## and 'transcripts' (see STEP 1 above), 'uex' should never contain +# ## out-of-bounds ranges i.e. the list elements in 'cvg' should always +# ## be Rle objects that are long enough with respect to the ranges +# ## in 'uex'. +# uex_cvg <- cvg[uex] # parallel to 'uex' +# } else { +# x1 <- x[strand(x) %in% c("+", "*")] +# x2 <- x[strand(x) %in% c("-", "*")] +# cvg1 <- coverage(x1) +# cvg2 <- coverage(x2) +# is_plus_ex <- strand(uex) == "+" +# is_minus_ex <- strand(uex) == "-" +# if (!identical(is_plus_ex, !is_minus_ex)) +# stop(wmsg("'transcripts' has exons on the * strand. ", +# "This is not supported at the moment.")) +# ## Because we've inferred the seqlengths that are missing from 'x' +# ## and 'transcripts' (see STEP 1 above), 'uex' should never contain +# ## out-of-bounds ranges i.e. the list elements in 'cvg1' and 'cvg2' +# ## should always be Rle objects that are long enough with respect to +# ## the ranges in 'uex'. +# uex_cvg <- cvg1[uex] +# uex_cvg[is_minus_ex] <- cvg2[uex[is_minus_ex]] +# } +# +# ## STEP 4 - Flip coverage for exons on minus strand. +# +# ## It feels like this is not as fast as it could be (the bottleneck being +# ## subsetting an Rle object which needs to be revisited at some point). +# uex_cvg <- revElements(uex_cvg, strand(uex) == "-") +# +# ## STEP 5 - Compute coverage by original exon ('ex_cvg'). +# +# ex2uex <- (seq_along(sm) - cumsum(!is_unique))[sm] # reverse index +# #stopifnot(identical(ex2uex[uex2ex], seq_along(uex2ex))) # sanity +# #stopifnot(identical(ex2uex[sm], ex2uex)) # sanity +# #stopifnot(all(uex[ex2uex] == ex)) # sanity +# +# ex_cvg <- uex_cvg[ex2uex] # parallel go 'ex' +# +# ## STEP 6 - Compute coverage of each transcript by concatenating coverage of +# ## its exons. +# +# ans <- IRanges:::regroupBySupergroup(ex_cvg, transcripts) +# +# ## STEP 7 - Propagate 'mcols(transcripts)'. +# +# mcols(ans) <- mcols(transcripts) +# ans +# } - #There doesn't seem to be much benefit in doing this. - #x <- subsetByOverlaps(x, transcripts, ignore.strand=TRUE) - if (ignore.strand) { - cvg <- coverage(x) - ## Because we've inferred the seqlengths that are missing from 'x' - ## and 'transcripts' (see STEP 1 above), 'uex' should never contain - ## out-of-bounds ranges i.e. the list elements in 'cvg' should always - ## be Rle objects that are long enough with respect to the ranges - ## in 'uex'. - uex_cvg <- cvg[uex] # parallel to 'uex' - } else { - x1 <- x[strand(x) %in% c("+", "*")] - x2 <- x[strand(x) %in% c("-", "*")] - cvg1 <- coverage(x1) - cvg2 <- coverage(x2) - is_plus_ex <- strand(uex) == "+" - is_minus_ex <- strand(uex) == "-" - if (!identical(is_plus_ex, !is_minus_ex)) - stop(wmsg("'transcripts' has exons on the * strand. ", - "This is not supported at the moment.")) - ## Because we've inferred the seqlengths that are missing from 'x' - ## and 'transcripts' (see STEP 1 above), 'uex' should never contain - ## out-of-bounds ranges i.e. the list elements in 'cvg1' and 'cvg2' - ## should always be Rle objects that are long enough with respect to - ## the ranges in 'uex'. - uex_cvg <- cvg1[uex] - uex_cvg[is_minus_ex] <- cvg2[uex[is_minus_ex]] - } - - ## STEP 4 - Flip coverage for exons on minus strand. - - ## It feels like this is not as fast as it could be (the bottleneck being - ## subsetting an Rle object which needs to be revisited at some point). - uex_cvg <- revElements(uex_cvg, strand(uex) == "-") - - ## STEP 5 - Compute coverage by original exon ('ex_cvg'). - - ex2uex <- (seq_along(sm) - cumsum(!is_unique))[sm] # reverse index - #stopifnot(identical(ex2uex[uex2ex], seq_along(uex2ex))) # sanity - #stopifnot(identical(ex2uex[sm], ex2uex)) # sanity - #stopifnot(all(uex[ex2uex] == ex)) # sanity - - ex_cvg <- uex_cvg[ex2uex] # parallel go 'ex' - - ## STEP 6 - Compute coverage of each transcript by concatenating coverage of - ## its exons. - - ans <- IRanges:::regroupBySupergroup(ex_cvg, transcripts) - - ## STEP 7 - Propagate 'mcols(transcripts)'. - - mcols(ans) <- mcols(transcripts) - ans -} ### Computing coverage by transcript (or CDS) of a set of ranges is an ### operation that feels a lot like extracting transcript sequences from a ### genome. Defined as an ordinary function for now.