diff --git a/DESCRIPTION b/DESCRIPTION index b9e4745b..ced3c911 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: tidybulk Title: Brings transcriptomics to the tidyverse -Version: 2.1.3 +Version: 2.1.4 Authors@R: c(person("Stefano", "Mangiola", email = "mangiolastefano@gmail.com", role = c("aut", "cre")), person("Maria", "Doyle", email = "Maria.Doyle@petermac.org", diff --git a/NAMESPACE b/NAMESPACE index 0f444158..9d2ec657 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,8 +15,10 @@ export(fill_missing_abundance) export(get_X_cibersort) export(get_bibliography) export(identify_abundant) +export(identify_abundant_per_category) export(impute_missing_abundance) export(keep_abundant) +export(keep_abundant_per_category) export(keep_variable) export(log10_reverse_trans) export(logit_trans) diff --git a/R/keep_identify_abundant_per_category.R b/R/keep_identify_abundant_per_category.R new file mode 100644 index 00000000..89828d19 --- /dev/null +++ b/R/keep_identify_abundant_per_category.R @@ -0,0 +1,367 @@ + +#' Identify abundant transcripts/genes in a per-category manner +#' +#' \lifecycle{experimental} +#' +#' @description +#' Identifies transcripts/genes that are consistently expressed above a threshold across samples. +#' This function adds a logical column `.abundant` to indicate which features pass the filtering criteria. +#' +#' @param .data A `SummarizedExperiment` object containing transcript/gene abundance data +#' @param abundance The name of the transcript/gene abundance column (character, preferred) +#' @param design A design matrix for more complex experimental designs. +#' @param formula_design A formula to generate the design matrix. Overrides `design` when both are provided +#' @param minimum_counts The minimum count threshold for a feature to be considered abundant +#' @param minimum_proportion The minimum proportion of samples in which a feature must be abundant +#' @param minimum_count_per_million ... +#' @param minimum_category The minimum number of categories/experimental groups that have sufficient abundance samples. +#' @param force Should existing .abundant column be replaced; defaults to FALSE. +#' @param coerce_design Should non-categorical design matrix be coerced; defaults to FALSE. +#' @param ... Further arguments taken for compatibility, but are not used. +#' +#' @details +#' This function provides an alternative filtering solution for edgeR's filterByExpr() function to identify consistently expressed features. +#' A feature is considered abundant if it has CPM > minimum_counts in at least minimum_proportion +#' of samples in at least minimum_category experimental groups (defined by design/formula_design). +#' +#' @return +#' Returns the input object with an additional logical column `.abundant` indicating which +#' features passed the abundance threshold criteria. +#' +#' @examples +#' ## Load airway dataset for examples +#' +#' data('airway', package = 'airway') +#' # Ensure a 'condition' column exists for examples expecting it +#' +#' SummarizedExperiment::colData(airway)$condition <- SummarizedExperiment::colData(airway)$dex +#' +#' +#' # Basic usage +#' airway |> identify_abundant_per_category() +#' +#' # With custom thresholds +#' airway |> identify_abundant_per_category( +#' minimum_counts = 5, +#' minimum_proportion = 0.5 +#' ) +#' +#' # Using a factor of interest +#' airway |> identify_abundant_per_category(formula_design = ~condition) +#' +#' @references +#' Mangiola, S., Molania, R., Dong, R., Doyle, M. A., & Papenfuss, A. T. (2021). tidybulk: an R tidy framework for modular transcriptomic data analysis. Genome Biology, 22(1), 42. doi:10.1186/s13059-020-02233-7 +#' +#' @references +#' McCarthy, D. J., Chen, Y., & Smyth, G. K. (2012). Differential expression analysis of +#' multifactor RNA-Seq experiments with respect to biological variation. Nucleic Acids Research, +#' 40(10), 4288-4297. DOI: 10.1093/bioinformatics/btp616 +#' +#' @importFrom dplyr filter +#' @importFrom tidyr drop_na +#' @importFrom magrittr not +#' @importFrom stats as.formula +#' +#' @docType methods +#' @rdname identify_abundant_per_category-methods +#' @export +setGeneric("identify_abundant_per_category", + function(.data, + abundance = assayNames(.data)[1], + design = NULL, + formula_design = NULL, + minimum_counts = 10, + minimum_proportion = 0.7, + minimum_count_per_million = NULL, + minimum_category = 1, + force = FALSE, + coerce_design = FALSE, + ... + ) + standardGeneric("identify_abundant_per_category")) + +.identify_abundant_per_category_se = function( + .data, + abundance = assayNames(.data)[1], + design = NULL, + formula_design = NULL, + minimum_counts = 10, + minimum_proportion = 0.7, + minimum_count_per_million = NULL, + minimum_category=1, + force=FALSE, + coerce_design = FALSE, + ... +) { + # Fix NOTEs + . = NULL + + # If formula_design is provided, use it to create the design matrix + if (!is.null(formula_design)) { + if (!is.null(design)) warning("Both formula_design and design provided; formula_design will be used.") + design <- model.matrix(formula_design, data = colData(.data)) + }else{ + # if neither is provided, populate design from ~ 1 (single-group, intercept only) + if (is.null(design)){ + warning("Neither formula_design and design provided; build design from formula ~ 1 .") + design <- model.matrix( ~ 1, data = colData(.data)) + } + } + + # check if design is categorical + if(any(!design%in%c(0,1))){ + if(coerce_design){ + message("tidybulk says: the design matrix has columns containing elements other than 0 or 1, but user choose to coerce it by setting coerce_design=TRUE.") + non_cat_cols = apply(design,2,function(x){any(!x%in%c(0,1))}) + design[,non_cat_cols] = 1 + }else{ + stop("The design matrix has columns containing elements other than 0 or 1. Filtering aborted. Check design or use coerce_design=TRUE to coerce those columns to 1.") + } + } + + # check if parameters are valid + if (minimum_counts < 0) + stop("The parameter minimum_counts must be >= 0") + + if(!is.null(minimum_count_per_million)){ + if (minimum_count_per_million < 0) + stop("The parameter minimum_count_per_million must be >= 0") + if (minimum_counts > 0) message("tidybulk says: both minimum_counts and minimum_count_per_million are provided. Use only minimum_count_per_million.") + } + + if (minimum_proportion < 0 | minimum_proportion > 1) + stop("The parameter minimum_proportion must be between 0 and 1") + + if (minimum_category < 0) + stop("The parameter minimum_category must be >= 0") + + # If column is present, use this for user awareness + if(".abundant" %in% colnames(rowData(.data))){ + if(force){ + message("tidybulk says: the column .abundant already exists in colData, but user choose to replace it by setting force=TRUE.") + }else{ + message("tidybulk says: the column .abundant already exists in colData. Nothing was done. Use force=TRUE to replace existing column.") + return(.data) + } + } + + # Check if package is installed, otherwise install + check_and_install_packages("edgeR") + + # Get logical vector of abundant features (TRUE = keep/abundant) + # Check if abundant in a sample + if(is.null(minimum_count_per_million)){ + count_is_abundant = assays(.data)[[abundance]] >= minimum_counts + }else{ + CPM = edgeR::cpm(assays(.data)[[abundance]],lib.size=colSums(assays(.data)[[abundance]])) + count_is_abundant = CPM >= minimum_count_per_million + } + + # calculate the proportion of abundant samples across categories for each feature + prop_abundant_per_category = apply(count_is_abundant,1,function(x){ + unname(sapply(split(x,apply(design,1,paste0,collapse="_")),function(xx){ + sum(xx)/length(xx) + })) + },simplify = FALSE) + + # determine if to keep feature based on the proportions + if_keep_feature = sapply(prop_abundant_per_category,function(this_prop){ + sum(this_prop >= minimum_proportion) >= minimum_category + }) + + # add/replace .abundant column + rowData(.data)$.abundant = if_keep_feature + + # Return + .data +} + +#' identify_abundant_per_category +#' @param minimum_count_per_million Minimum CPM cutoff to use for filtering. If provided, this will override the minimum_counts parameter. Default is NULL. +#' +#' @docType methods +#' @rdname identify_abundant_per_category-methods +#' +#' +setMethod("identify_abundant_per_category", + "SummarizedExperiment", + .identify_abundant_per_category_se +) + +#' identify_abundant_per_category +#' @param minimum_count_per_million Minimum CPM cutoff to use for filtering. If provided, this will override the minimum_counts parameter. Default is NULL. +#' +#' @docType methods +#' @rdname identify_abundant_per_category-methods +#' +#' +setMethod("identify_abundant_per_category", + "RangedSummarizedExperiment", + .identify_abundant_per_category_se +) + + + +#' Filter to keep only abundant transcripts/genes in a per-category manner +#' +#' \lifecycle{experimental} +#' +#' @description +#' Filters the data to keep only transcripts/genes that are consistently expressed above +#' a threshold across samples. This is a filtering version of identify_abundant_per_category() that +#' removes low-abundance features instead of just marking them. +#' +#' @param .data A `SummarizedExperiment` object containing transcript/gene abundance data +#' @param abundance The name of the transcript/gene abundance column (character, preferred) +#' @param design A design matrix for more complex experimental designs. +#' @param formula_design A formula to generate the design matrix. Overrides `design` when both are provided +#' @param minimum_counts The minimum count threshold for a feature to be considered abundant +#' @param minimum_proportion The minimum proportion of samples in which a feature must be abundant +#' @param minimum_count_per_million ... +#' @param minimum_category The minimum number of categories/experimental groups that have sufficient abundance samples. +#' @param force Should existing .abundant column be replaced; defaults to FALSE. +#' @param coerce_design Should non-categorical design matrix be coerced; defaults to FALSE. +#' @param ... Further arguments taken for compatibility, but are not used. +#' +#' @details +#' This function provides an alternative filtering solution for edgeR's filterByExpr() function to identify consistently expressed features. +#' A feature is considered abundant if it has CPM > minimum_counts in at least minimum_proportion +#' of samples in at least minimum_category experimental groups (defined by design/formula_design). +#' +#' This function is similar to identify_abundant_per_category() but in addition to writing to an .abundant column, +#' it also filters out the low-abundance features directly. +#' +#' @return +#' Returns a filtered version of the input object containing only the features that passed +#' the abundance threshold criteria. +#' +#' @examples +#' ## Load airway dataset for examples +#' +#' data('airway', package = 'airway') +#' # Ensure a 'condition' column exists for examples expecting it +#' +#' SummarizedExperiment::colData(airway)$condition <- SummarizedExperiment::colData(airway)$dex +#' +#' +#' # Basic usage +#' airway |> keep_abundant_per_category() +#' +#' # With custom thresholds +#' airway |> keep_abundant_per_category( +#' minimum_counts = 5, +#' minimum_proportion = 0.5 +#' ) +#' +#' # Using a factor of interest +#' airway |> keep_abundant_per_category(formula_design = ~condition) +#' +#' @references +#' Mangiola, S., Molania, R., Dong, R., Doyle, M. A., & Papenfuss, A. T. (2021). tidybulk: an R tidy framework for modular transcriptomic data analysis. Genome Biology, 22(1), 42. doi:10.1186/s13059-020-02233-7 +#' +#' @references +#' McCarthy, D. J., Chen, Y., & Smyth, G. K. (2012). Differential expression analysis of +#' multifactor RNA-Seq experiments with respect to biological variation. Nucleic Acids Research, +#' 40(10), 4288-4297. DOI: 10.1093/bioinformatics/btp616 +#' +#' @importFrom dplyr filter +#' +#' +#' @docType methods +#' @rdname keep_abundant_per_category-methods +#' @export +#' +setGeneric("keep_abundant_per_category", + function(.data, + abundance = assayNames(.data)[1], + design = NULL, + formula_design = NULL, + minimum_counts = 10, + minimum_proportion = 0.7, + minimum_count_per_million = NULL, + minimum_category = 1, + force = FALSE, + coerce_design = FALSE, + ... + ) + standardGeneric("keep_abundant_per_category")) + +.keep_abundant_per_category_se = function(.data, + abundance = assayNames(.data)[1], + design = NULL, + formula_design = NULL, + minimum_counts = 10, + minimum_proportion = 0.7, + minimum_count_per_million = NULL, + minimum_category = 1, + force = FALSE, + coerce_design = FALSE, + ... +) +{ + # Fix NOTEs + . = NULL + + # If formula_design is provided, use it to create the design matrix + if (!is.null(formula_design)) { + if (!is.null(design)) warning("Both formula_design and design provided; formula_design will be used.") + design <- model.matrix(formula_design, data = colData(.data)) + }else{ + # if neither is provided, populate design from ~ 1 (single-group, intercept only) + if (is.null(design)){ + warning("Neither formula_design and design provided; build design from formula ~ 1 .") + design <- model.matrix( ~ 1, data = colData(.data)) + } + } + + # check if design is categorical + if(any(!design%in%c(0,1))){ + if(coerce_design){ + message("tidybulk says: the design matrix has columns containing elements other than 0 or 1, but user choose to coerce it by setting coerce_design=TRUE.") + non_cat_cols = apply(design,2,function(x){any(!x%in%c(0,1))}) + design[,non_cat_cols] = 1 + }else{ + stop("The design matrix has columns containing elements other than 0 or 1. Filtering aborted. Check design or use coerce_design=TRUE to coerce those columns to 1.") + } + } + + # identify abundant features + .data = + .data |> + identify_abundant_per_category( + abundance = abundance, + design = design, + minimum_counts = minimum_counts, + minimum_proportion = minimum_proportion, + minimum_count_per_million = minimum_count_per_million, + minimum_category = minimum_category, + force=force, + coerce_design=coerce_design, + ... + ) + + # filter identified features and return + idx <- rowData(.data)[[".abundant"]] + .data[idx,] +} + +#' keep_abundant_per_category +#' +#' @docType methods +#' @rdname keep_abundant_per_category-methods +#' +setMethod("keep_abundant_per_category", + "SummarizedExperiment", + .keep_abundant_per_category_se) + +#' keep_abundant +#' +#' @docType methods +#' @rdname keep_abundant_per_category-methods +#' +setMethod("keep_abundant_per_category", + "RangedSummarizedExperiment", + .keep_abundant_per_category_se) + + + diff --git a/man/identify_abundant_per_category-methods.Rd b/man/identify_abundant_per_category-methods.Rd new file mode 100644 index 00000000..348be610 --- /dev/null +++ b/man/identify_abundant_per_category-methods.Rd @@ -0,0 +1,119 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/keep_identify_abundant_per_category.R +\docType{methods} +\name{identify_abundant_per_category} +\alias{identify_abundant_per_category} +\alias{identify_abundant_per_category,SummarizedExperiment-method} +\alias{identify_abundant_per_category,RangedSummarizedExperiment-method} +\title{Identify abundant transcripts/genes in a per-category manner} +\usage{ +identify_abundant_per_category( + .data, + abundance = assayNames(.data)[1], + design = NULL, + formula_design = NULL, + minimum_counts = 10, + minimum_proportion = 0.7, + minimum_count_per_million = NULL, + minimum_category = 1, + force = FALSE, + coerce_design = FALSE, + ... +) + +\S4method{identify_abundant_per_category}{SummarizedExperiment}( + .data, + abundance = assayNames(.data)[1], + design = NULL, + formula_design = NULL, + minimum_counts = 10, + minimum_proportion = 0.7, + minimum_count_per_million = NULL, + minimum_category = 1, + force = FALSE, + coerce_design = FALSE, + ... +) + +\S4method{identify_abundant_per_category}{RangedSummarizedExperiment}( + .data, + abundance = assayNames(.data)[1], + design = NULL, + formula_design = NULL, + minimum_counts = 10, + minimum_proportion = 0.7, + minimum_count_per_million = NULL, + minimum_category = 1, + force = FALSE, + coerce_design = FALSE, + ... +) +} +\arguments{ +\item{.data}{A `SummarizedExperiment` object containing transcript/gene abundance data} + +\item{abundance}{The name of the transcript/gene abundance column (character, preferred)} + +\item{design}{A design matrix for more complex experimental designs.} + +\item{formula_design}{A formula to generate the design matrix. Overrides `design` when both are provided} + +\item{minimum_counts}{The minimum count threshold for a feature to be considered abundant} + +\item{minimum_proportion}{The minimum proportion of samples in which a feature must be abundant} + +\item{minimum_count_per_million}{Minimum CPM cutoff to use for filtering. If provided, this will override the minimum_counts parameter. Default is NULL.} + +\item{minimum_category}{The minimum number of categories/experimental groups that have sufficient abundance samples.} + +\item{force}{Should existing .abundant column be replaced; defaults to FALSE.} + +\item{coerce_design}{Should non-categorical design matrix be coerced; defaults to FALSE.} + +\item{...}{Further arguments taken for compatibility, but are not used.} +} +\value{ +Returns the input object with an additional logical column `.abundant` indicating which +features passed the abundance threshold criteria. +} +\description{ +Identifies transcripts/genes that are consistently expressed above a threshold across samples. +This function adds a logical column `.abundant` to indicate which features pass the filtering criteria. +} +\details{ +\lifecycle{experimental} + + +This function provides an alternative filtering solution for edgeR's filterByExpr() function to identify consistently expressed features. +A feature is considered abundant if it has CPM > minimum_counts in at least minimum_proportion +of samples in at least minimum_category experimental groups (defined by design/formula_design). +} +\examples{ +## Load airway dataset for examples + + data('airway', package = 'airway') + # Ensure a 'condition' column exists for examples expecting it + + SummarizedExperiment::colData(airway)$condition <- SummarizedExperiment::colData(airway)$dex + + +# Basic usage +airway |> identify_abundant_per_category() + +# With custom thresholds +airway |> identify_abundant_per_category( + minimum_counts = 5, + minimum_proportion = 0.5 +) + +# Using a factor of interest +airway |> identify_abundant_per_category(formula_design = ~condition) + +} +\references{ +Mangiola, S., Molania, R., Dong, R., Doyle, M. A., & Papenfuss, A. T. (2021). tidybulk: an R tidy framework for modular transcriptomic data analysis. Genome Biology, 22(1), 42. doi:10.1186/s13059-020-02233-7 + +McCarthy, D. J., Chen, Y., & Smyth, G. K. (2012). Differential expression analysis of +multifactor RNA-Seq experiments with respect to biological variation. Nucleic Acids Research, +40(10), 4288-4297. DOI: 10.1093/bioinformatics/btp616 +} diff --git a/man/keep_abundant_per_category-methods.Rd b/man/keep_abundant_per_category-methods.Rd new file mode 100644 index 00000000..c8f79761 --- /dev/null +++ b/man/keep_abundant_per_category-methods.Rd @@ -0,0 +1,123 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/keep_identify_abundant_per_category.R +\docType{methods} +\name{keep_abundant_per_category} +\alias{keep_abundant_per_category} +\alias{keep_abundant_per_category,SummarizedExperiment-method} +\alias{keep_abundant_per_category,RangedSummarizedExperiment-method} +\title{Filter to keep only abundant transcripts/genes in a per-category manner} +\usage{ +keep_abundant_per_category( + .data, + abundance = assayNames(.data)[1], + design = NULL, + formula_design = NULL, + minimum_counts = 10, + minimum_proportion = 0.7, + minimum_count_per_million = NULL, + minimum_category = 1, + force = FALSE, + coerce_design = FALSE, + ... +) + +\S4method{keep_abundant_per_category}{SummarizedExperiment}( + .data, + abundance = assayNames(.data)[1], + design = NULL, + formula_design = NULL, + minimum_counts = 10, + minimum_proportion = 0.7, + minimum_count_per_million = NULL, + minimum_category = 1, + force = FALSE, + coerce_design = FALSE, + ... +) + +\S4method{keep_abundant_per_category}{RangedSummarizedExperiment}( + .data, + abundance = assayNames(.data)[1], + design = NULL, + formula_design = NULL, + minimum_counts = 10, + minimum_proportion = 0.7, + minimum_count_per_million = NULL, + minimum_category = 1, + force = FALSE, + coerce_design = FALSE, + ... +) +} +\arguments{ +\item{.data}{A `SummarizedExperiment` object containing transcript/gene abundance data} + +\item{abundance}{The name of the transcript/gene abundance column (character, preferred)} + +\item{design}{A design matrix for more complex experimental designs.} + +\item{formula_design}{A formula to generate the design matrix. Overrides `design` when both are provided} + +\item{minimum_counts}{The minimum count threshold for a feature to be considered abundant} + +\item{minimum_proportion}{The minimum proportion of samples in which a feature must be abundant} + +\item{minimum_count_per_million}{...} + +\item{minimum_category}{The minimum number of categories/experimental groups that have sufficient abundance samples.} + +\item{force}{Should existing .abundant column be replaced; defaults to FALSE.} + +\item{coerce_design}{Should non-categorical design matrix be coerced; defaults to FALSE.} + +\item{...}{Further arguments taken for compatibility, but are not used.} +} +\value{ +Returns a filtered version of the input object containing only the features that passed +the abundance threshold criteria. +} +\description{ +Filters the data to keep only transcripts/genes that are consistently expressed above +a threshold across samples. This is a filtering version of identify_abundant_per_category() that +removes low-abundance features instead of just marking them. +} +\details{ +\lifecycle{experimental} + + +This function provides an alternative filtering solution for edgeR's filterByExpr() function to identify consistently expressed features. +A feature is considered abundant if it has CPM > minimum_counts in at least minimum_proportion +of samples in at least minimum_category experimental groups (defined by design/formula_design). + +This function is similar to identify_abundant_per_category() but in addition to writing to an .abundant column, +it also filters out the low-abundance features directly. +} +\examples{ +## Load airway dataset for examples + + data('airway', package = 'airway') + # Ensure a 'condition' column exists for examples expecting it + + SummarizedExperiment::colData(airway)$condition <- SummarizedExperiment::colData(airway)$dex + + +# Basic usage +airway |> keep_abundant_per_category() + +# With custom thresholds +airway |> keep_abundant_per_category( + minimum_counts = 5, + minimum_proportion = 0.5 +) + +# Using a factor of interest +airway |> keep_abundant_per_category(formula_design = ~condition) + +} +\references{ +Mangiola, S., Molania, R., Dong, R., Doyle, M. A., & Papenfuss, A. T. (2021). tidybulk: an R tidy framework for modular transcriptomic data analysis. Genome Biology, 22(1), 42. doi:10.1186/s13059-020-02233-7 + +McCarthy, D. J., Chen, Y., & Smyth, G. K. (2012). Differential expression analysis of +multifactor RNA-Seq experiments with respect to biological variation. Nucleic Acids Research, +40(10), 4288-4297. DOI: 10.1093/bioinformatics/btp616 +}