From c42290abf947a1c06331093428c6af7598fda75e Mon Sep 17 00:00:00 2001 From: Dario Date: Wed, 15 Apr 2026 10:24:59 +0200 Subject: [PATCH 01/10] fixing vignettes nomenclatures --- vignettes/SpaceTrooper_utilities.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/SpaceTrooper_utilities.Rmd b/vignettes/SpaceTrooper_utilities.Rmd index ee68496..d44e414 100644 --- a/vignettes/SpaceTrooper_utilities.Rmd +++ b/vignettes/SpaceTrooper_utilities.Rmd @@ -9,7 +9,7 @@ output: toc: true toc_float: true vignette: > - %\VignetteIndexEntry{Imaging-based Spatial Transcriptomics Data Quality Control with SpaceTrooper} + %\VignetteIndexEntry{SpaceTrooper utilities overview} %\VignetteEncoding{UTF-8} %\VignetteEngine{knitr::rmarkdown} editor_options: From ddff68f5a255b364a01f094b3e0ea74a13910adc Mon Sep 17 00:00:00 2001 From: Dario Date: Wed, 6 May 2026 10:41:18 +0200 Subject: [PATCH 02/10] transfer/eval coeff between datasets --- inst/scripts/evaluate_qs_split_kfold.R | 726 ++++++++++++++++++++++++ inst/scripts/transfer_qs_coefficients.R | 665 ++++++++++++++++++++++ 2 files changed, 1391 insertions(+) create mode 100644 inst/scripts/evaluate_qs_split_kfold.R create mode 100644 inst/scripts/transfer_qs_coefficients.R diff --git a/inst/scripts/evaluate_qs_split_kfold.R b/inst/scripts/evaluate_qs_split_kfold.R new file mode 100644 index 0000000..faa4d1d --- /dev/null +++ b/inst/scripts/evaluate_qs_split_kfold.R @@ -0,0 +1,726 @@ +library(SpaceTrooper) +library(SummarizedExperiment) +library(S4Vectors) + +# Apply a fitted ridge-logistic QS model to a new SpatialExperiment subset. +# +# The function rebuilds the design matrix from `colData`, using the same +# formula employed during training, and stores predicted probabilities in the +# `QC_score` column. +score_subset <- function(spe_subset, fit, lambda, model_formula) { + # Convert per-cell metadata to a plain data.frame so model.matrix can use it. + df_subset <- as.data.frame(SummarizedExperiment::colData(spe_subset)) + + # Recreate the exact predictor matrix used by glmnet. + x_subset <- stats::model.matrix(stats::as.formula(model_formula), + data=df_subset) + + # Predict the probability of being a good-quality cell. + spe_subset$QC_score <- as.vector( + stats::predict(fit, s=lambda, newx=x_subset, type="response") + ) + spe_subset +} + +# Compute AUROC from ranks, without requiring extra packages. +# +# `label_positive` must be a 0/1 vector, where 1 indicates the positive class. +# In this script we use it both for bad-vs-good and for derived summaries. +auc_rank <- function(score, label_positive) { + # Coerce labels to integer to avoid issues with logical/factor inputs. + label_positive <- as.integer(label_positive) + n_pos <- sum(label_positive == 1L) + n_neg <- sum(label_positive == 0L) + + # AUROC is undefined if one of the two classes is missing. + if (n_pos == 0L || n_neg == 0L) { + return(NA_real_) + } + + # Mann-Whitney / rank-based computation of AUROC. + ranks <- rank(score, ties.method="average") + (sum(ranks[label_positive == 1L]) - n_pos * (n_pos + 1) / 2) / + (n_pos * n_neg) +} + +# Build pseudo-reference labels on a subset, following the same SpaceTrooper +# logic used internally to define training examples. +# +# Returned labels are: +# - `qcscore_train = 0`: low-quality examples +# - `qcscore_train = 1`: good-quality examples +build_reference_labels <- function(spe_subset, verbose=FALSE) { + # Detect outliers for the metrics used by QS. + spe_ref <- computeOutliersQCScore(spe_subset) + + # Remove variables that do not have enough outliers to be informative. + spe_ref <- checkOutliers(spe_ref, verbose=verbose) + + # Build a balanced table of bad and good examples. + df_ref <- computeTrainDF( + colData=SummarizedExperiment::colData(spe_ref), + formulaVars=S4Vectors::metadata(spe_ref)$formula_variables, + tech=S4Vectors::metadata(spe_ref)$technology, + verbose=verbose + ) + + # Keep only the columns needed for downstream evaluation. + df_ref[, c("cell_id", "qcscore_train")] +} + +# Safe wrapper around `build_reference_labels()`. +# +# On small or unlucky splits/folds, SpaceTrooper may not find enough outliers +# to define a reference set. Instead of stopping the whole script, this returns +# the error message so the caller can inspect it. +safe_reference_labels <- function(spe_subset, verbose=FALSE) { + tryCatch( + list(data=build_reference_labels(spe_subset, verbose=verbose), + error=NULL), + error=function(e) list(data=NULL, error=conditionMessage(e)) + ) +} + +# Safely extract an optional scalar character field from a list. +# +# This is used when collecting fold-level error messages at the end of k-fold +# evaluation. Missing, NULL, or NA entries are converted to `NA_character_`. +extract_optional_chr1 <- function(x, name) { + value <- x[[name]] + + if (is.null(value) || length(value) == 0L || all(is.na(value))) { + return(NA_character_) + } + + as.character(value[[1]]) +} + +# Add good/bad/unlabelled labels to a SpatialExperiment subset for plotting. +# +# Labels are derived from the evaluation table: +# - `qcscore_train = 0` -> bad +# - `qcscore_train = 1` -> good +# - missing label -> unlabelled +append_eval_labels <- function( + spe_subset, + df_eval, + label_col="eval_label", + colour_col="eval_label_color" +) { + labels <- rep("unlabelled", ncol(spe_subset)) + + if (!is.null(df_eval) && nrow(df_eval) > 0L) { + idx <- match(spe_subset$cell_id, df_eval$cell_id) + keep <- !is.na(idx) + labels[keep] <- ifelse(df_eval$qcscore_train[idx[keep]] == 0, + "bad", "good") + } + + spe_subset[[label_col]] <- factor(labels, + levels=c("bad", "good", "unlabelled")) + + palette <- c( + bad="#D55E00", + good="#009E73", + unlabelled="#BDBDBD" + ) + spe_subset[[colour_col]] <- unname(palette[as.character(spe_subset[[label_col]])]) + spe_subset +} + +# Select either the whole result object (single split) or one fold result. +# +# This helper keeps the plotting functions compact and makes the API uniform +# across single-split and k-fold outputs. +get_result_level <- function(result, fold=NULL) { + if (identical(result$split_type, "k_fold_cv")) { + if (is.null(fold)) { + stop("Please provide `fold` when plotting a k-fold result") + } + if (length(fold) != 1L || is.na(fold) || fold < 1 || fold > length(result$folds)) { + stop("`fold` must be an integer between 1 and the number of folds") + } + return(result$folds[[fold]]) + } + + result +} + +# Plot cells coloured as good, bad, or unlabelled. +# +# By default the function uses centroids because they are always available in +# the current workflow. For k-fold results, choose which fold to display. +plot_labelled_cells <- function( + result, + dataset=c("train", "test"), + fold=NULL, + size=0.05, + alpha=0.8, + scaleBar=TRUE +) { + dataset <- match.arg(dataset) + level_result <- get_result_level(result, fold=fold) + + spe_subset <- switch(dataset, + train=level_result$spe_train, + test=level_result$spe_test + ) + df_eval <- switch(dataset, + train=level_result$train_eval, + test=level_result$test_eval + ) + + spe_subset <- append_eval_labels(spe_subset, df_eval) + + title_suffix <- if (identical(result$split_type, "k_fold_cv")) { + paste0(" - fold ", fold) + } else { + "" + } + + plotCentroids( + spe_subset, + colourBy="eval_label", + palette="eval_label_color", + size=size, + alpha=alpha, + scaleBar=scaleBar + ) + + ggplot2::labs( + title=paste0("Labelled cells (", dataset, ")", title_suffix), + colour="Reference label", + fill="Reference label" + ) +} + +# Compute the points of a ROC curve from evaluation labels and QS values. +# +# The positive class is the bad-quality group, so the curve is built using +# `1 - QC_score` as the decision score. +build_roc_df <- function(df_eval) { + if (is.null(df_eval) || nrow(df_eval) == 0L) { + return(data.frame()) + } + + label_positive <- as.integer(df_eval$qcscore_train == 0) + n_pos <- sum(label_positive == 1L) + n_neg <- sum(label_positive == 0L) + + if (n_pos == 0L || n_neg == 0L) { + return(data.frame()) + } + + score_bad <- 1 - df_eval$QC_score + ord <- order(score_bad, decreasing=TRUE) + score_bad <- score_bad[ord] + label_positive <- label_positive[ord] + + tp <- cumsum(label_positive == 1L) + fp <- cumsum(label_positive == 0L) + change <- c(score_bad[-1] != score_bad[-length(score_bad)], TRUE) + + data.frame( + fpr=c(0, fp[change] / n_neg), + tpr=c(0, tp[change] / n_pos) + ) +} + +# Plot ROC curves for a single split or across folds. +# +# In k-fold mode the function overlays one ROC curve per fold and reports the +# average AUC in the subtitle. +plot_roc_eval <- function(result, dataset=c("train", "test")) { + dataset <- match.arg(dataset) + + if (identical(result$split_type, "k_fold_cv")) { + roc_list <- lapply(seq_along(result$folds), function(i) { + df_eval <- switch(dataset, + train=result$folds[[i]]$train_eval, + test=result$folds[[i]]$test_eval + ) + roc_df <- build_roc_df(df_eval) + if (nrow(roc_df) == 0L) { + return(NULL) + } + roc_df$fold <- i + roc_df + }) + roc_list <- Filter(Negate(is.null), roc_list) + + if (length(roc_list) == 0L) { + stop("No ROC curves available for the selected dataset") + } + + roc_df <- do.call(rbind, roc_list) + auc_mean <- switch(dataset, + train=result$train_summary_average$auc_bad_vs_good, + test=result$test_summary_average$auc_bad_vs_good + ) + + return( + ggplot2::ggplot(roc_df, + ggplot2::aes(x=fpr, y=tpr, colour=factor(fold), group=fold)) + + ggplot2::geom_abline(intercept=0, slope=1, linetype=2, + colour="grey60") + + ggplot2::geom_path(linewidth=0.7, alpha=0.8) + + ggplot2::coord_fixed() + + ggplot2::theme_bw() + + ggplot2::labs( + title=paste0("ROC curves across folds (", dataset, ")"), + subtitle=paste0("Mean AUC = ", round(auc_mean, 4)), + x="False positive rate", + y="True positive rate", + colour="Fold" + ) + ) + } + + df_eval <- switch(dataset, + train=result$train_eval, + test=result$test_eval + ) + roc_df <- build_roc_df(df_eval) + + if (nrow(roc_df) == 0L) { + stop("No ROC curve available for the selected dataset") + } + + auc_value <- switch(dataset, + train=result$train_summary$auc_bad_vs_good, + test=result$test_summary$auc_bad_vs_good + ) + + ggplot2::ggplot(roc_df, ggplot2::aes(x=fpr, y=tpr)) + + ggplot2::geom_abline(intercept=0, slope=1, linetype=2, + colour="grey60") + + ggplot2::geom_path(linewidth=0.9, colour="#0072B2") + + ggplot2::coord_fixed() + + ggplot2::theme_bw() + + ggplot2::labs( + title=paste0("ROC curve (", dataset, ")"), + subtitle=paste0("AUC = ", round(auc_value, 4)), + x="False positive rate", + y="True positive rate" + ) +} + +# Plot AUC values for each fold. +# +# In single-split mode the function returns a one-point summary. In k-fold mode +# it shows one point per fold and a dashed line for the average AUC. +plot_auc_across_folds <- function(result, dataset=c("train", "test")) { + dataset <- match.arg(dataset) + + if (identical(result$split_type, "k_fold_cv")) { + auc_df <- switch(dataset, + train=result$train_summary_per_fold, + test=result$test_summary_per_fold + ) + auc_mean <- switch(dataset, + train=result$train_summary_average$auc_bad_vs_good, + test=result$test_summary_average$auc_bad_vs_good + ) + + return( + ggplot2::ggplot(auc_df, + ggplot2::aes(x=fold, y=auc_bad_vs_good)) + + ggplot2::geom_hline(yintercept=auc_mean, linetype=2, + colour="#D55E00") + + ggplot2::geom_line(colour="#0072B2") + + ggplot2::geom_point(size=2.2, colour="#0072B2") + + ggplot2::theme_bw() + + ggplot2::labs( + title=paste0("AUC across folds (", dataset, ")"), + subtitle=paste0("Dashed line = mean AUC = ", round(auc_mean, 4)), + x="Fold", + y="AUC" + ) + ) + } + + auc_value <- switch(dataset, + train=result$train_summary$auc_bad_vs_good, + test=result$test_summary$auc_bad_vs_good + ) + auc_df <- data.frame(split=1, auc_bad_vs_good=auc_value) + + ggplot2::ggplot(auc_df, ggplot2::aes(x=split, y=auc_bad_vs_good)) + + ggplot2::geom_point(size=3, colour="#0072B2") + + ggplot2::theme_bw() + + ggplot2::scale_x_continuous(breaks=1, labels="split") + + ggplot2::labs( + title=paste0("AUC summary (", dataset, ")"), + subtitle=paste0("AUC = ", round(auc_value, 4)), + x=NULL, + y="AUC" + ) +} + +# Summarise QS performance against pseudo-reference labels. +# +# The summary reports: +# - number of labelled cells +# - number of bad/good examples +# - median QS in bad and good examples +# - AUROC for separating bad from good +# - sensitivity/specificity at the chosen threshold +summarise_eval <- function(df_eval, threshold=0.5) { + # Return an empty data.frame if the evaluation table is missing. + if (is.null(df_eval) || nrow(df_eval) == 0L) { + return(data.frame()) + } + + # Derive boolean masks for the pseudo-reference classes. + is_bad <- df_eval$qcscore_train == 0 + is_good <- df_eval$qcscore_train == 1 + + # A cell is predicted as bad if its QS is below the chosen threshold. + pred_bad <- df_eval$QC_score < threshold + + data.frame( + n_labeled=nrow(df_eval), + n_bad=sum(is_bad), + n_good=sum(is_good), + median_qs_bad=if (sum(is_bad) > 0) { + stats::median(df_eval$QC_score[is_bad]) + } else { + NA_real_ + }, + median_qs_good=if (sum(is_good) > 0) { + stats::median(df_eval$QC_score[is_good]) + } else { + NA_real_ + }, + # `1 - QC_score` is used so that larger values correspond to worse cells. + auc_bad_vs_good=auc_rank(1 - df_eval$QC_score, as.integer(is_bad)), + sensitivity_bad_qs_lt_threshold=if (sum(is_bad) > 0) { + mean(pred_bad[is_bad]) + } else { + NA_real_ + }, + specificity_good_qs_ge_threshold=if (sum(is_good) > 0) { + mean(!pred_bad[is_good]) + } else { + NA_real_ + } + ) +} + +# Create train/test indices for a single random split. +# +# `train_fraction` is only used when `k_folds = 1`. It controls the proportion +# of cells assigned to the training set, allowing e.g. 50/50, 70/30, 80/20. +make_random_split <- function(n_cells, train_fraction, seed) { + stopifnot(length(n_cells) == 1L, n_cells > 1L) + stopifnot(length(train_fraction) == 1L, !is.na(train_fraction)) + + if (train_fraction <= 0 || train_fraction >= 1) { + stop("train_fraction must be strictly between 0 and 1") + } + + set.seed(seed) + + # Sample the training cells at random. + n_train <- floor(n_cells * train_fraction) + if (n_train < 1L || n_train >= n_cells) { + stop("train_fraction produces an empty training or test set") + } + + train_idx <- sample(seq_len(n_cells), size=n_train, replace=FALSE) + test_idx <- setdiff(seq_len(n_cells), train_idx) + + list(train_idx=train_idx, test_idx=test_idx) +} + +# Assign each cell to one of the k folds used in cross-validation. +# +# Each fold acts as test set once, while the remaining folds are used for +# training. This function is only used when `k_folds > 1`. +make_fold_ids <- function(n_cells, k_folds, seed) { + stopifnot(length(n_cells) == 1L, n_cells > 1L) + stopifnot(length(k_folds) == 1L, !is.na(k_folds), k_folds >= 1) + + k_folds <- as.integer(k_folds) + if (k_folds < 1L) { + stop("k_folds must be >= 1") + } + + set.seed(seed) + + if (k_folds > n_cells) { + stop("k_folds cannot be greater than the number of cells") + } + + # Shuffle cells and then distribute them approximately evenly across folds. + shuffled <- sample(seq_len(n_cells)) + fold_id <- integer(n_cells) + fold_id[shuffled] <- rep(seq_len(k_folds), length.out=n_cells) + fold_id +} + +# Train the QS model on one training split/fold and evaluate it on both train +# and test cells. +# +# This is the core routine used by both single-split validation and k-fold CV. +run_one_fold <- function(spe, test_idx, threshold=0.5, verbose=TRUE) { + # Partition the input SpatialExperiment into training and test subsets. + spe_train <- spe[, -test_idx] + spe_test <- spe[, test_idx] + + # Compute outliers only on training cells, so the model is fitted without + # information leakage from the test set. + spe_train <- computeOutliersQCScore(spe_train) + spe_train <- checkOutliers(spe_train, verbose=verbose) + + # Extract the training formula selected by SpaceTrooper. + formula_vars <- S4Vectors::metadata(spe_train)$formula_variables + model_formula <- getModelFormula(formula_vars, verbose=verbose) + + # Create the balanced training table of pseudo-bad and pseudo-good cells. + df_train <- computeTrainDF( + colData=SummarizedExperiment::colData(spe_train), + formulaVars=formula_vars, + tech=S4Vectors::metadata(spe_train)$technology, + verbose=verbose + ) + + # Build the design matrix and estimate the ridge penalty. + x_train <- stats::model.matrix(stats::as.formula(model_formula), data=df_train) + best_lambda <- computeLambda(df_train, model_formula) + + # Fit the ridge logistic model. + fit <- trainModel(x_train, df_train) + + # Score both the training and test subsets using the trained model. + spe_train <- score_subset(spe_train, fit, best_lambda, model_formula) + spe_test <- score_subset(spe_test, fit, best_lambda, model_formula) + + # Training evaluation uses the very same labelled examples used for fitting. + train_eval <- merge( + data.frame(cell_id=spe_train$cell_id, QC_score=spe_train$QC_score), + df_train[, c("cell_id", "qcscore_train")], + by="cell_id" + ) + + # On the test set, build pseudo-reference labels independently. + test_ref <- safe_reference_labels(spe_test, verbose=verbose) + test_eval <- if (is.null(test_ref$data)) { + NULL + } else { + merge( + data.frame(cell_id=spe_test$cell_id, QC_score=spe_test$QC_score), + test_ref$data, + by="cell_id" + ) + } + + # Return both summaries and raw objects/tables for further inspection. + list( + formula_variables=formula_vars, + model_formula=model_formula, + lambda=best_lambda, + train_class_balance=table(df_train$qcscore_train), + train_label_error=NA_character_, + train_summary=summarise_eval(train_eval, threshold=threshold), + test_summary=summarise_eval(test_eval, threshold=threshold), + test_label_error=test_ref$error, + spe_train=spe_train, + spe_test=spe_test, + train_eval=train_eval, + test_eval=test_eval + ) +} + +# Combine per-fold summaries into a single table and compute their mean. +# +# This is only used in k-fold mode. The output contains: +# - `per_fold`: one row per fold +# - `average`: average across folds for each metric +combine_fold_summaries <- function(fold_results, summary_name) { + summaries <- lapply(seq_along(fold_results), function(i) { + x <- fold_results[[i]][[summary_name]] + if (is.null(x) || nrow(x) == 0L) { + return(NULL) + } + x$fold <- i + x + }) + summaries <- Filter(Negate(is.null), summaries) + + if (length(summaries) == 0L) { + return(list(per_fold=data.frame(), average=data.frame())) + } + + per_fold <- do.call(rbind, summaries) + metric_cols <- setdiff(colnames(per_fold), "fold") + + # Average each numeric summary metric across folds. + average <- as.data.frame( + lapply(per_fold[, metric_cols, drop=FALSE], function(x) mean(x, na.rm=TRUE)) + ) + + list(per_fold=per_fold, average=average) +} + +# Main entry point for evaluating SpaceTrooper QS on a CosMx dataset. +# +# Modes: +# - `k_folds = 1`: single random split using `train_fraction` +# - `k_folds > 1`: k-fold cross-validation +# +# Important notes: +# - `train_fraction` is ignored when `k_folds > 1` +# - `threshold` is used only for classification summaries, not for model fitting +evaluate_qs_split <- function( + data_dir="/Users/inzirio/Downloads/CosMx_data/DBKero/CosMx_Breast/CosMx_data_Case2", + sample_name="CosMx_Case2", + seed=1713, + threshold=0.5, + k_folds=1, + train_fraction=0.5, + verbose=TRUE +) { + # Validate the high-level input parameters. + stopifnot(length(k_folds) == 1L, !is.na(k_folds), k_folds >= 1) + k_folds <- as.integer(k_folds) + stopifnot(length(train_fraction) == 1L, !is.na(train_fraction)) + + if (train_fraction <= 0 || train_fraction >= 1) { + stop("train_fraction must be strictly between 0 and 1") + } + + # Read the CosMx dataset from disk. + spe <- readCosmxSPE(data_dir, sampleName=sample_name) + + # Compute the per-cell QC metrics required by the QS model. + spe <- spatialPerCellQC( + spe, + rmZeros=TRUE, + negProbList=c("NegPrb", "Negative", "SystemControl") + ) + + # Count the number of cells remaining after QC preprocessing. + n_cells <- ncol(spe) + + if (k_folds == 1L) { + # Single random split: build train/test indices according to the + # requested training fraction. + split_idx <- make_random_split( + n_cells=n_cells, + train_fraction=train_fraction, + seed=seed + ) + + res <- run_one_fold( + spe=spe, + test_idx=split_idx$test_idx, + threshold=threshold, + verbose=verbose + ) + + return(c( + list( + split_type="random_split", + k_folds=k_folds, + seed=seed, + threshold=threshold, + train_fraction=train_fraction, + test_fraction=1 - train_fraction, + n_cells=n_cells, + n_train=length(split_idx$train_idx), + n_test=length(split_idx$test_idx) + ), + res + )) + } + + # K-fold mode: assign each cell to a fold once. + fold_id <- make_fold_ids(n_cells=n_cells, k_folds=k_folds, seed=seed) + + fold_results <- lapply(seq_len(k_folds), function(i) { + if (verbose) { + message("Running fold ", i, " of ", k_folds) + } + + # Fold i is the test set; all other folds are the training set. + run_one_fold( + spe=spe, + test_idx=which(fold_id == i), + threshold=threshold, + verbose=verbose + ) + }) + + train_combined <- combine_fold_summaries(fold_results, "train_summary") + test_combined <- combine_fold_summaries(fold_results, "test_summary") + test_label_errors <- vapply( + fold_results, + extract_optional_chr1, + character(1), + name = "test_label_error" + ) + + train_label_errors <- vapply( + fold_results, + extract_optional_chr1, + character(1), + name = "train_label_error" + ) + + # if you only want real errors later, drop missing values explicitly + test_label_errors <- stats::na.omit(test_label_errors) + train_label_errors <- stats::na.omit(train_label_errors) + + list( + split_type="k_fold_cv", + k_folds=k_folds, + seed=seed, + threshold=threshold, + n_cells=n_cells, + fold_assignment=fold_id, + train_summary_per_fold=train_combined$per_fold, + train_summary_average=train_combined$average, + test_summary_per_fold=test_combined$per_fold, + test_summary_average=test_combined$average, + test_label_errors=test_label_errors, + train_label_errors=train_label_errors, + folds=fold_results + ) +} + +# Esempi di utilizzo: +# source("inst/scripts/evaluate_qs_split_kfold.R") +# +# Split singolo 50/50 +# res_split_50_50 <- evaluate_qs_split(k_folds=1, train_fraction=0.50) +# res_split_50_50$test_summary +# +# Split singolo 70/30 +# res_split_70_30 <- evaluate_qs_split(k_folds=1, train_fraction=0.70) +# res_split_70_30$test_summary +res_split <- evaluate_qs_split(k_folds=1, train_fraction=0.70) + +plot_labelled_cells(res_split, dataset="train", size=0.08) +plot_labelled_cells(res_split, dataset="test", size=0.08) + +plot_roc_eval(res_split, dataset="test") +plot_auc_across_folds(res_split, dataset="test") +# +# Split singolo 80/20 +# res_split_80_20 <- evaluate_qs_split(k_folds=1, train_fraction=0.80) +# res_split_80_20$test_summary +# +# 5-fold cross-validation +res_kfold_5 <- evaluate_qs_split(k_folds=5) +res_kfold_5$test_summary_per_fold +res_kfold_5$test_summary_average +res_kfold_5$test_label_errors +# +# Celle colorate good/bad sul test set del fold 1 +plot_labelled_cells(res_kfold_5, dataset="test", fold=1, size=0.1) +# +# ROC curve across folds +plot_roc_eval(res_kfold_5, dataset="test") +# +# AUC across folds +plot_auc_across_folds(res_kfold_5, dataset="test") diff --git a/inst/scripts/transfer_qs_coefficients.R b/inst/scripts/transfer_qs_coefficients.R new file mode 100644 index 0000000..27b4c3a --- /dev/null +++ b/inst/scripts/transfer_qs_coefficients.R @@ -0,0 +1,665 @@ +library(SpaceTrooper) +library(SummarizedExperiment) +library(S4Vectors) + +# Read a dataset with the appropriate SpaceTrooper reader according to the +# selected technology. +# +# Supported values are: +# - "cosmx" +# - "cosmx_protein" +# - "xenium" +read_dataset_for_transfer <- function( + dir_name, + technology=c("cosmx", "cosmx_protein", "xenium"), + sample_name="sample01" +) { + technology <- match.arg(technology) + + switch( + technology, + cosmx=readCosmxSPE(dir_name, sampleName=sample_name), + cosmx_protein=readCosmxProteinSPE(dir_name, sampleName=sample_name), + xenium=readXeniumSPE( + dir_name, + sampleName=sample_name, + computeMissingMetrics=TRUE, + keepPolygons=FALSE + ) + ) +} + +# Compute the per-cell QC metrics required before training or applying QS. +# +# The same preprocessing is applied to both source and target datasets so that +# transferred coefficients operate on harmonised metric columns. +prepare_dataset_for_transfer <- function( + spe, + rm_zeros=TRUE, + neg_prob_list=c("NegPrb", "Negative", "SystemControl") +) { + spatialPerCellQC( + spe, + rmZeros=rm_zeros, + negProbList=neg_prob_list + ) +} + +# Identify the metric set that can be transferred from source to target. +# +# For heterogeneous transfers, the safest common metrics are: +# - log2SignalDensity +# - Area_um +# - log2Ctrl_total_ratio +# +# The border-dependent aspect-ratio term is included only if both datasets have +# `log2AspectRatio` and `dist_border`, which is typically the case for CosMx to +# CosMx transfers. +get_transferable_metrics <- function(source_spe, target_spe) { + source_cols <- colnames(SummarizedExperiment::colData(source_spe)) + target_cols <- colnames(SummarizedExperiment::colData(target_spe)) + shared_cols <- intersect(source_cols, target_cols) + + metrics <- intersect( + c("log2SignalDensity", "Area_um", "log2Ctrl_total_ratio"), + shared_cols + ) + + if (all(c("log2AspectRatio", "dist_border") %in% source_cols) && + all(c("log2AspectRatio", "dist_border") %in% target_cols)) { + metrics <- c(metrics, "log2AspectRatio") + } + + unique(metrics) +} + +# Extract a readable coefficient table from a fitted glmnet model. +# +# Coefficients are returned at the selected lambda, so they are exactly the ones +# transferred to the target dataset. +extract_qs_coefficients <- function(fit, lambda) { + coef_mat <- as.matrix(stats::predict(fit, s=lambda, type="coefficients")) + data.frame( + term=rownames(coef_mat), + coefficient=as.numeric(coef_mat[, 1]), + row.names=NULL + ) +} + +# Apply a fitted QS model to a new SpatialExperiment object. +# +# The design matrix is rebuilt from the target `colData` using the compatible +# transfer formula trained on the source dataset. +score_target_dataset <- function(spe, fit, lambda, model_formula) { + df <- as.data.frame(SummarizedExperiment::colData(spe)) + x <- stats::model.matrix(stats::as.formula(model_formula), data=df) + spe$QC_score <- as.vector( + stats::predict(fit, s=lambda, newx=x, type="response") + ) + spe +} + +# Compute AUROC with a rank-based formula, avoiding extra package dependencies. +auc_rank <- function(score, label_positive) { + label_positive <- as.integer(label_positive) + n_pos <- sum(label_positive == 1L) + n_neg <- sum(label_positive == 0L) + + if (n_pos == 0L || n_neg == 0L) { + return(NA_real_) + } + + ranks <- rank(score, ties.method="average") + (sum(ranks[label_positive == 1L]) - n_pos * (n_pos + 1) / 2) / + (n_pos * n_neg) +} + +# Build pseudo-reference labels on a dataset using the same metric subset used +# for transfer. +# +# This is useful for evaluating how well transferred coefficients separate +# pseudo-bad from pseudo-good cells on the target dataset. +build_reference_labels_transfer <- function(spe, metric_list, verbose=FALSE) { + spe_ref <- computeOutliersQCScore(spe, metricList=metric_list) + spe_ref <- checkOutliers(spe_ref, verbose=verbose) + + df_ref <- computeTrainDF( + colData=SummarizedExperiment::colData(spe_ref), + formulaVars=S4Vectors::metadata(spe_ref)$formula_variables, + tech=S4Vectors::metadata(spe_ref)$technology, + verbose=verbose + ) + + df_ref[, c("cell_id", "qcscore_train")] +} + +# Safe wrapper for pseudo-reference label generation. +safe_reference_labels_transfer <- function(spe, metric_list, verbose=FALSE) { + tryCatch( + list( + data=build_reference_labels_transfer( + spe, + metric_list=metric_list, + verbose=verbose + ), + error=NULL + ), + error=function(e) list(data=NULL, error=conditionMessage(e)) + ) +} + +# Summarise the separation between pseudo-bad and pseudo-good cells. +# +# The summary is computed on either source or target after scoring with the +# transferred model. +summarise_transfer_eval <- function(df_eval, threshold=0.5) { + if (is.null(df_eval) || nrow(df_eval) == 0L) { + return(data.frame()) + } + + is_bad <- df_eval$qcscore_train == 0 + is_good <- df_eval$qcscore_train == 1 + pred_bad <- df_eval$QC_score < threshold + + data.frame( + n_labeled=nrow(df_eval), + n_bad=sum(is_bad), + n_good=sum(is_good), + median_qs_bad=if (sum(is_bad) > 0) { + stats::median(df_eval$QC_score[is_bad]) + } else { + NA_real_ + }, + median_qs_good=if (sum(is_good) > 0) { + stats::median(df_eval$QC_score[is_good]) + } else { + NA_real_ + }, + auc_bad_vs_good=auc_rank(1 - df_eval$QC_score, as.integer(is_bad)), + sensitivity_bad_qs_lt_threshold=if (sum(is_bad) > 0) { + mean(pred_bad[is_bad]) + } else { + NA_real_ + }, + specificity_good_qs_ge_threshold=if (sum(is_good) > 0) { + mean(!pred_bad[is_good]) + } else { + NA_real_ + } + ) +} + +# Add good/bad/unlabelled labels to a SpatialExperiment object for plotting. +# +# Labels are derived from the evaluation table built for either the source or +# the target dataset after scoring with transferred coefficients. +append_transfer_labels <- function( + spe, + df_eval, + label_col="transfer_label", + colour_col="transfer_label_color" +) { + labels <- rep("unlabelled", ncol(spe)) + + if (!is.null(df_eval) && nrow(df_eval) > 0L) { + idx <- match(spe$cell_id, df_eval$cell_id) + keep <- !is.na(idx) + labels[keep] <- ifelse(df_eval$qcscore_train[idx[keep]] == 0, + "bad", "good") + } + + spe[[label_col]] <- factor(labels, levels=c("bad", "good", "unlabelled")) + palette <- c(bad="#D55E00", good="#009E73", unlabelled="#BDBDBD") + spe[[colour_col]] <- unname(palette[as.character(spe[[label_col]])]) + spe +} + +# Build a ROC data.frame from pseudo-reference labels and transferred QS. +build_transfer_roc_df <- function(df_eval) { + if (is.null(df_eval) || nrow(df_eval) == 0L) { + return(data.frame()) + } + + label_positive <- as.integer(df_eval$qcscore_train == 0) + n_pos <- sum(label_positive == 1L) + n_neg <- sum(label_positive == 0L) + + if (n_pos == 0L || n_neg == 0L) { + return(data.frame()) + } + + score_bad <- 1 - df_eval$QC_score + ord <- order(score_bad, decreasing=TRUE) + score_bad <- score_bad[ord] + label_positive <- label_positive[ord] + + tp <- cumsum(label_positive == 1L) + fp <- cumsum(label_positive == 0L) + change <- c(score_bad[-1] != score_bad[-length(score_bad)], TRUE) + + data.frame( + fpr=c(0, fp[change] / n_neg), + tpr=c(0, tp[change] / n_pos) + ) +} + +# Plot source or target cells coloured as pseudo-good / pseudo-bad after +# scoring with transferred coefficients. +plot_transfer_labelled_cells <- function( + result, + dataset=c("source", "target"), + size=0.05, + alpha=0.8, + scaleBar=TRUE +) { + dataset <- match.arg(dataset) + + spe <- switch(dataset, + source=result$source_spe, + target=result$target_spe + ) + df_eval <- switch(dataset, + source=result$source_eval, + target=result$target_eval + ) + + spe <- append_transfer_labels(spe, df_eval) + + plotCentroids( + spe, + colourBy="transfer_label", + palette="transfer_label_color", + size=size, + alpha=alpha, + scaleBar=scaleBar + ) + + ggplot2::labs( + title=paste0("Transferred QS labels on ", dataset, " dataset"), + colour="Reference label", + fill="Reference label" + ) +} + +# Plot the spatial distribution of transferred QS on source or target. +plot_transfer_qs <- function( + result, + dataset=c("source", "target"), + size=0.05, + alpha=0.8, + scaleBar=TRUE +) { + dataset <- match.arg(dataset) + + spe <- switch(dataset, + source=result$source_spe, + target=result$target_spe + ) + + plotCentroids( + spe, + colourBy="QC_score", + size=size, + alpha=alpha, + scaleBar=scaleBar + ) + + ggplot2::labs(title=paste0("Transferred QC score on ", dataset, " dataset")) +} + +# Plot ROC curves for the source or target transfer evaluation. +plot_transfer_roc <- function(result, dataset=c("source", "target")) { + dataset <- match.arg(dataset) + + df_eval <- switch(dataset, + source=result$source_eval, + target=result$target_eval + ) + roc_df <- build_transfer_roc_df(df_eval) + + if (nrow(roc_df) == 0L) { + stop("No ROC curve available for the selected dataset") + } + + auc_value <- switch(dataset, + source=result$source_summary$auc_bad_vs_good, + target=result$target_summary$auc_bad_vs_good + ) + + ggplot2::ggplot(roc_df, ggplot2::aes(x=fpr, y=tpr)) + + ggplot2::geom_abline(intercept=0, slope=1, linetype=2, + colour="grey60") + + ggplot2::geom_path(linewidth=0.9, colour="#0072B2") + + ggplot2::coord_fixed() + + ggplot2::theme_bw() + + ggplot2::labs( + title=paste0("ROC curve for transferred QS (", dataset, ")"), + subtitle=paste0("AUC = ", round(auc_value, 4)), + x="False positive rate", + y="True positive rate" + ) +} + +# Compare AUC between source and target after transfer. +plot_transfer_auc_summary <- function(result) { + auc_df <- data.frame( + dataset=c("source", "target"), + auc_bad_vs_good=c( + result$source_summary$auc_bad_vs_good, + if (nrow(result$target_summary) == 0L) NA_real_ else result$target_summary$auc_bad_vs_good + ) + ) + + ggplot2::ggplot(auc_df, ggplot2::aes(x=dataset, y=auc_bad_vs_good, fill=dataset)) + + ggplot2::geom_col(width=0.65, alpha=0.85, show.legend=FALSE) + + ggplot2::geom_text(ggplot2::aes(label=round(auc_bad_vs_good, 4)), + vjust=-0.4, na.rm=TRUE) + + ggplot2::theme_bw() + + ggplot2::labs( + title="AUC summary for transferred QS", + x=NULL, + y="AUC" + ) +} + +# Train a native QS model directly on the target dataset, so it can be compared +# against the transferred QS. +# +# By default it uses the full SpaceTrooper native workflow on the target. +# If `use_transfer_metrics = TRUE`, it restricts the native training to the same +# metric subset used by the transferred model. +compute_native_target_qs <- function( + result, + use_transfer_metrics=FALSE, + verbose=TRUE +) { + target_native <- result$target_spe + target_native$QC_score_transfer <- target_native$QC_score + + if (isTRUE(use_transfer_metrics)) { + target_native <- computeOutliersQCScore( + target_native, + metricList=result$transfer_metrics + ) + target_native <- checkOutliers(target_native, verbose=verbose) + + formula_vars <- S4Vectors::metadata(target_native)$formula_variables + model_formula <- getModelFormula(formula_vars, verbose=verbose) + train_df <- computeTrainDF( + colData=SummarizedExperiment::colData(target_native), + formulaVars=formula_vars, + tech=S4Vectors::metadata(target_native)$technology, + verbose=verbose + ) + x_target <- stats::model.matrix(stats::as.formula(model_formula), data=train_df) + best_lambda <- computeLambda(train_df, model_formula) + fit_native <- trainModel(x_target, train_df) + target_native$QC_score_native <- as.vector( + stats::predict( + fit_native, + s=best_lambda, + newx=stats::model.matrix( + stats::as.formula(model_formula), + data=as.data.frame(SummarizedExperiment::colData(target_native)) + ), + type="response" + ) + ) + return(target_native) + } + + target_native <- computeQCScore(target_native, verbose=verbose) + target_native$QC_score_native <- target_native$QC_score + target_native$QC_score <- target_native$QC_score_transfer + target_native +} + +# Compare transferred QS with a native QS trained directly on the target. +# +# The function reports correlations and agreement on low-quality calls. +compare_transferred_vs_native <- function( + result, + threshold=0.5, + low_fraction=0.1, + use_transfer_metrics=FALSE, + verbose=TRUE +) { + stopifnot(length(low_fraction) == 1L, low_fraction > 0, low_fraction < 1) + + target_native <- compute_native_target_qs( + result, + use_transfer_metrics=use_transfer_metrics, + verbose=verbose + ) + + df_compare <- data.frame( + cell_id=target_native$cell_id, + qc_transfer=target_native$QC_score_transfer, + qc_native=target_native$QC_score_native + ) + + transfer_cut <- stats::quantile(df_compare$qc_transfer, probs=low_fraction, na.rm=TRUE) + native_cut <- stats::quantile(df_compare$qc_native, probs=low_fraction, na.rm=TRUE) + + comparison_summary <- data.frame( + pearson=stats::cor(df_compare$qc_transfer, df_compare$qc_native, + method="pearson", use="complete.obs"), + spearman=stats::cor(df_compare$qc_transfer, df_compare$qc_native, + method="spearman", use="complete.obs"), + low_qc_agreement_at_threshold=mean( + (df_compare$qc_transfer < threshold) == (df_compare$qc_native < threshold), + na.rm=TRUE + ), + overlap_lowest_fraction=mean( + (df_compare$qc_transfer <= transfer_cut) & + (df_compare$qc_native <= native_cut), + na.rm=TRUE + ) / low_fraction + ) + + list( + summary=comparison_summary, + df_compare=df_compare, + target_native_spe=target_native, + threshold=threshold, + low_fraction=low_fraction, + use_transfer_metrics=use_transfer_metrics + ) +} + +# Scatter plot of transferred QS vs native QS on the target dataset. +plot_transfer_vs_native_scatter <- function(comparison_result) { + ggplot2::ggplot( + comparison_result$df_compare, + ggplot2::aes(x=qc_native, y=qc_transfer) + ) + + ggplot2::geom_point(alpha=0.4, size=0.6, colour="#0072B2") + + ggplot2::geom_abline(intercept=0, slope=1, linetype=2, + colour="grey60") + + ggplot2::theme_bw() + + ggplot2::labs( + title="Transferred QS vs native QS on target", + subtitle=paste0( + "Spearman = ", + round(comparison_result$summary$spearman, 4), + ", agreement@", + comparison_result$threshold, + " = ", + round(comparison_result$summary$low_qc_agreement_at_threshold, 4) + ), + x="Native target QS", + y="Transferred QS" + ) +} + +# Train a transferable QS model on one dataset and apply it to another. +# +# Workflow: +# 1. read source and target datasets +# 2. compute QC metrics on both +# 3. identify the metric subset that exists in both datasets +# 4. train the ridge-logistic QS model on the source dataset using only those metrics +# 5. transfer the fitted coefficients to the target dataset +# 6. optionally evaluate the transfer on pseudo-reference labels in the target +transfer_qs_coefficients <- function( + source_dir, + target_dir, + source_technology=c("cosmx", "cosmx_protein", "xenium"), + target_technology=c("cosmx", "cosmx_protein", "xenium"), + source_sample_name="source_sample", + target_sample_name="target_sample", + threshold=0.5, + evaluate_target=TRUE, + verbose=TRUE +) { + source_technology <- match.arg(source_technology) + target_technology <- match.arg(target_technology) + + # Read the source dataset, from which coefficients will be learned. + source_spe <- read_dataset_for_transfer( + dir_name=source_dir, + technology=source_technology, + sample_name=source_sample_name + ) + + # Read the target dataset, on which coefficients will be transferred. + target_spe <- read_dataset_for_transfer( + dir_name=target_dir, + technology=target_technology, + sample_name=target_sample_name + ) + + # Harmonise per-cell QC metrics in both datasets. + source_spe <- prepare_dataset_for_transfer(source_spe) + target_spe <- prepare_dataset_for_transfer(target_spe) + + # Keep only the subset of QS metrics that is available in both datasets. + transfer_metrics <- get_transferable_metrics(source_spe, target_spe) + + if (!"log2SignalDensity" %in% transfer_metrics) { + stop("log2SignalDensity is required for transferable QS training") + } + if (length(transfer_metrics) < 2L) { + stop("At least two shared metrics are recommended for coefficient transfer") + } + + if (verbose) { + message("Transferable metrics: ", paste(transfer_metrics, collapse=", ")) + } + + # Compute source outliers only on the transferable metric subset. + source_spe <- computeOutliersQCScore(source_spe, metricList=transfer_metrics) + source_spe <- checkOutliers(source_spe, verbose=verbose) + + formula_vars <- S4Vectors::metadata(source_spe)$formula_variables + if (!"log2SignalDensity" %in% names(formula_vars)) { + stop("The transferable source formula lost log2SignalDensity after outlier filtering") + } + + # Build the compatible model formula and the balanced source training table. + model_formula <- getModelFormula(formula_vars, verbose=verbose) + source_train_df <- computeTrainDF( + colData=SummarizedExperiment::colData(source_spe), + formulaVars=formula_vars, + tech=S4Vectors::metadata(source_spe)$technology, + verbose=verbose + ) + + # Fit the ridge-logistic QS model on the source dataset. + x_source <- stats::model.matrix(stats::as.formula(model_formula), + data=source_train_df) + best_lambda <- computeLambda(source_train_df, model_formula) + fit <- trainModel(x_source, source_train_df) + coefficient_table <- extract_qs_coefficients(fit, best_lambda) + + # Score the full source and target datasets with the same transferred model. + source_spe <- score_target_dataset(source_spe, fit, best_lambda, model_formula) + target_spe <- score_target_dataset(target_spe, fit, best_lambda, model_formula) + + # Build the labelled source evaluation set from the source training examples. + source_eval <- merge( + data.frame(cell_id=source_spe$cell_id, QC_score=source_spe$QC_score), + source_train_df[, c("cell_id", "qcscore_train")], + by="cell_id" + ) + source_summary <- summarise_transfer_eval(source_eval, threshold=threshold) + + target_eval <- NULL + target_summary <- data.frame() + target_label_error <- NA_character_ + + if (isTRUE(evaluate_target)) { + # Build pseudo-reference labels independently on the target dataset, + # using the same transferable metric subset. + target_ref <- safe_reference_labels_transfer( + target_spe, + metric_list=transfer_metrics, + verbose=verbose + ) + target_label_error <- target_ref$error + + if (!is.null(target_ref$data)) { + target_eval <- merge( + data.frame(cell_id=target_spe$cell_id, QC_score=target_spe$QC_score), + target_ref$data, + by="cell_id" + ) + target_summary <- summarise_transfer_eval(target_eval, + threshold=threshold) + } + } + + list( + source_technology=source_technology, + target_technology=target_technology, + transfer_metrics=transfer_metrics, + model_formula=model_formula, + lambda=best_lambda, + coefficient_table=coefficient_table, + source_summary=source_summary, + target_summary=target_summary, + target_label_error=target_label_error, + source_train_df=source_train_df, + source_eval=source_eval, + target_eval=target_eval, + source_spe=source_spe, + target_spe=target_spe, + fit=fit + ) +} + +# Esempi di utilizzo: +# source("inst/scripts/transfer_qs_coefficients.R") +# +# Train su CosMx e applica a Xenium +# res_transfer_cx <- transfer_qs_coefficients( +# source_dir="/path/to/cosmx_dataset", +# source_technology="cosmx", +# target_dir="/path/to/xenium_dataset", +# target_technology="xenium", +# source_sample_name="CosMx_source", +# target_sample_name="Xenium_target" +# ) +# res_transfer_cx$coefficient_table +# res_transfer_cx$target_summary +# plot_transfer_labelled_cells(res_transfer_cx, dataset="target", size=0.08) +# plot_transfer_qs(res_transfer_cx, dataset="target", size=0.08) +# plot_transfer_roc(res_transfer_cx, dataset="target") +# plot_transfer_auc_summary(res_transfer_cx) +# +# cmp_transfer_cx <- compare_transferred_vs_native( +# res_transfer_cx, +# threshold=0.5, +# low_fraction=0.1 +# ) +# cmp_transfer_cx$summary +# plot_transfer_vs_native_scatter(cmp_transfer_cx) +# +# Train su Xenium e applica a CosMx +# res_transfer_xc <- transfer_qs_coefficients( +# source_dir="/path/to/xenium_dataset", +# source_technology="xenium", +# target_dir="/path/to/cosmx_dataset", +# target_technology="cosmx" +# ) +# res_transfer_xc$coefficient_table +# res_transfer_xc$target_summary +# res_transfer_xc$target_summary From a4bdc6c997c8f8a43e83837357de424e6236b204 Mon Sep 17 00:00:00 2001 From: Dario Date: Fri, 15 May 2026 16:13:02 +0200 Subject: [PATCH 03/10] adding functions for model transfer across datasets --- R/QC.R | 243 ++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 225 insertions(+), 18 deletions(-) diff --git a/R/QC.R b/R/QC.R index 14ef5ed..6d69efd 100644 --- a/R/QC.R +++ b/R/QC.R @@ -370,7 +370,17 @@ computeThresholdFlags <- function(spe, totalThreshold=0, #' @export computeLambda <- function(trainDF, modelFormula) { # model_formula <- getModelFormula(technology) + train_ok <- .filterCompleteModelCases( + df=trainDF, + modelFormula=modelFormula, + response=trainDF$qcscore_train, + context="training cells for lambda selection" + ) + + trainDF <- trainDF[train_ok, , drop=FALSE] + model_matrix <- model.matrix(as.formula(modelFormula), data=trainDF) + model_matrix <- .dropModelIntercept(model_matrix) ridge_cv <- cv.glmnet(model_matrix, trainDF$qcscore_train, family="binomial", alpha=0, lambda=NULL) bestLambda <- ridge_cv$lambda.min @@ -442,7 +452,7 @@ computeLambda <- function(trainDF, modelFormula) { #' set.seed(1998) #' spe <- computeQCScore(spe) #' summary(spe$QC_score) -computeQCScore <- function(spe, bestLambda=NULL, verbose=FALSE) { +computeQCScore <- function(spe, bestLambda=NULL, modelFormula=NULL, verbose=FALSE) { stopifnot(is(spe, "SpatialExperiment")) if (dim(spe[,spe$total == 0])[2] != 0) { warning(paste0(dim(spe[,spe$total == 0])[2], @@ -458,34 +468,71 @@ computeQCScore <- function(spe, bestLambda=NULL, verbose=FALSE) { train_df <- computeTrainDF(df, out_var, tech, verbose) - model_formula <- getModelFormula(out_var, verbose) + if (is.null(modelFormula)) { + model_formula <- getModelFormula(out_var, verbose) + } else { + model_formula <- modelFormula + if (verbose) { + message("Using user-supplied model formula:") + message(model_formula) + } + } + train_ok <- .filterCompleteModelCases( + df=train_df, + modelFormula=model_formula, + response=train_df$qcscore_train, + context="training cells" + ) + + train_df <- train_df[train_ok, , drop=FALSE] model_matrix <- model.matrix(as.formula(model_formula), data=train_df) + model_matrix <- .dropModelIntercept(model_matrix) model <- trainModel(model_matrix, train_df) if(is.null(bestLambda)) { bestLambda <- computeLambda(train_df, model_formula) } + coefs <- coef(model, s=bestLambda) + if (verbose) { message("Model coefficients for every term used in the formula:") - message(paste(round(predict(model, s=bestLambda, type="coefficients"), - 2), - collapse = " ")) + message( paste( rownames(coefs), round(as.numeric(coefs), 2), sep="=", + collapse=" ")) } - full_matrix <- model.matrix(as.formula(model_formula), data=df) - cd <- colData(spe) - cd$QC_score <- as.vector(predict(model, s=bestLambda, - newx = full_matrix, - type = "response")) - spe$QC_score <- cd$QC_score - # train_identity <- rep("TEST", dim(spe)[2]) - # train_bad <- train_df$cell_id[train_df$qcscore_train==0] - # train_good <- train_df$cell_id[train_df$qcscore_train==1] - # spe$training_status <- dplyr::case_when( - # spe$cell_id %in% train_bad ~ "BAD", - # spe$cell_id %in% train_good ~ "GOOD", - # TRUE ~ train_identity) + full_ok <- .filterCompleteModelCases(df=df, modelFormula=model_formula, + context="cells") + + full_matrix <- model.matrix(as.formula(model_formula), + data=df[full_ok, , drop=FALSE]) + + full_matrix <- .dropModelIntercept(full_matrix) + full_matrix <- full_matrix[, colnames(model_matrix), drop=FALSE] + + ## NAs may come from model variables used in `model_formula`, e.g. cells with + ## missing `log2AspectRatio` when aspect ratio cannot be computed from polygons. + ## Predict only complete cases; incomplete cells keep QC_score = NA. + qc_score <- rep(NA_real_, nrow(df)) + qc_score[full_ok] <- as.vector( + predict(model, s=bestLambda, newx=full_matrix, type="response") + ) + + spe$QC_score <- qc_score + + metadata(spe)$QCScore_model <- list( + model=model, + bestLambda=bestLambda, + model_formula=model_formula, + formula_variables=out_var, + model_matrix_colnames=colnames(model_matrix), + coefficients=coefs, + coefficients_table=data.frame( + term=rownames(coefs), + coefficient=as.numeric(coefs), + row.names=NULL + ) + ) return(spe) } @@ -1148,3 +1195,163 @@ checkOutliers <- function(spe, verbose=FALSE) { return(list(df=df, out_var=out_var, tech=tech)) } +#' applyQCScoreModel +#' +#' @description +#' Apply a previously trained QC score model to a new SpatialExperiment object. +#' +#' @param spe A `SpatialExperiment` object with QC metrics already computed. +#' @param qcModel A QC score model object, usually stored in +#' `metadata(spe)$QCScore_model`. +#' @param scoreName Name of the output column in `colData`. +#' +#' @return A `SpatialExperiment` object with added QC score in `colData`. +#' +#' @export +#' @examples +#' +#' example(readCosmxSPE) +#' +#' ## Compute per-cell quality control metrics +#' spe <- spatialPerCellQC(spe) +#' +#' ## Split the object only for example purposes +#' set.seed(1998) +#' idx <- sample(seq_len(ncol(spe)), floor(ncol(spe) / 2)) +#' spe_train <- spe[, idx] +#' spe_test <- spe[, -idx] +#' +#' ## Train the Quality Control (QC) score model on one dataset +#' spe_train <- computeQCScore(spe_train) +#' qc_model <- metadata(spe_train)$QCScore_model +#' +#' ## Apply the trained model to another dataset +#' spe_test <- applyQCScoreModel( +#' spe=spe_test, +#' qcModel=qc_model, +#' scoreName="QC_score_transferred" +#' ) +#' +#' summary(spe_test$QC_score_transferred) +applyQCScoreModel <- function(spe, qcModel, scoreName="QC_score") { + stopifnot(is(spe, "SpatialExperiment")) + + required <- c( + "model", + "bestLambda", + "model_formula", + "model_matrix_colnames" + ) + + stopifnot( + "qcModel is missing required fields" = + all(required %in% names(qcModel)) + ) + + df <- as.data.frame(colData(spe)) + + ok <- .filterCompleteModelCases( + df=df, + modelFormula=qcModel$model_formula, + context="cells" + ) + + new_matrix <- model.matrix( + as.formula(qcModel$model_formula), + data=df[ok, , drop=FALSE] + ) + + new_matrix <- .dropModelIntercept(new_matrix) + + missing_cols <- setdiff( + qcModel$model_matrix_colnames, + colnames(new_matrix) + ) + + if (length(missing_cols) > 0L) { + zero_matrix <- matrix( + 0, + nrow=nrow(new_matrix), + ncol=length(missing_cols), + dimnames=list(rownames(new_matrix), missing_cols) + ) + + new_matrix <- cbind(new_matrix, zero_matrix) + } + + extra_cols <- setdiff( + colnames(new_matrix), + qcModel$model_matrix_colnames + ) + + if (length(extra_cols) > 0L) { + warning( + "New model matrix contains columns not present in the ", + "training matrix. These columns will be ignored: ", + paste(extra_cols, collapse=", ") + ) + } + + new_matrix <- new_matrix[, qcModel$model_matrix_colnames, drop=FALSE] + + score <- rep(NA_real_, nrow(df)) + + score[ok] <- as.vector( + predict( + qcModel$model, + s=qcModel$bestLambda, + newx=new_matrix, + type="response" + ) + ) + + cd <- colData(spe) + cd[[scoreName]] <- score + colData(spe) <- cd + + metadata(spe)$QCScore_model_applied <- qcModel + + return(spe) +} + +.dropModelIntercept <- function(modelMatrix) { + if ("(Intercept)" %in% colnames(modelMatrix)) { + modelMatrix <- modelMatrix[, colnames(modelMatrix)!="(Intercept)", + drop=FALSE] + } + + return(modelMatrix) +} + +.filterCompleteModelCases <- function(df, modelFormula, response=NULL, + context="cells") { + + vars <- all.vars(as.formula(modelFormula)) + + missing_vars <- setdiff(vars, colnames(df)) + + if (length(missing_vars) > 0L) { + stop( + "Missing variables required by model formula: ", + paste(missing_vars, collapse=", ") + ) + } + ## Missing values may come from variables used in `modelFormula`, for + ## example `log2AspectRatio` when aspect ratio cannot be computed from + ## polygons. + ok <- complete.cases(df[, vars, drop=FALSE]) + + if (!is.null(response)) { + ok <- ok & !is.na(response) + } + + if (!all(ok)) { + warning( + sum(!ok), + " ", context, + " contain missing model variables." + ) + } + + return(ok) +} From 8aeee801dc2e480af7be5867e8499013ab06102b Mon Sep 17 00:00:00 2001 From: Dario Date: Fri, 15 May 2026 16:13:13 +0200 Subject: [PATCH 04/10] script for model transfer --- inst/scripts/transfer_coeffs.R | 126 +++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 inst/scripts/transfer_coeffs.R diff --git a/inst/scripts/transfer_coeffs.R b/inst/scripts/transfer_coeffs.R new file mode 100644 index 0000000..19812b4 --- /dev/null +++ b/inst/scripts/transfer_coeffs.R @@ -0,0 +1,126 @@ +library(SpaceTrooper) +library(SummarizedExperiment) +library(S4Vectors) + +set.seed(1998) + +common_formula <- "~(log2SignalDensity + Area_um + log2AspectRatio)^2" + +## ----------------------------- +## Read Xenium data +## ----------------------------- + +dbkx_path <- "~/Downloads/Xenium_data/db_kero_xen" + +spexen <- readXeniumSPE(dbkx_path) + +spexen <- spatialPerCellQC(spexen) + +spexen <- computeQCScore( + spexen, + modelFormula=common_formula, + verbose=TRUE +) + +xen_model <- metadata(spexen)$QCScore_model + + +## ----------------------------- +## Read CosMx data +## ----------------------------- + +cosm_path <- "~/Downloads/CosMx_data/DBKero/CosMx_Breast/CosMx_data_Case2/" + +specosm <- readCosmxSPE(cosm_path) + +specosm <- spatialPerCellQC(specosm) + +specosm <- computeQCScore( + specosm, + modelFormula=common_formula, + verbose=TRUE +) + +cosmx_model <- metadata(specosm)$QCScore_model + + +## ----------------------------- +## Save native scores before transfer +## ----------------------------- + +spexen$QC_score_xenium_native <- spexen$QC_score +specosm$QC_score_cosmx_native <- specosm$QC_score + + +## ----------------------------- +## Apply transferred models +## ----------------------------- + +specosm <- applyQCScoreModel( + spe=specosm, + qcModel=xen_model, + scoreName="QC_score_xenium_model" +) + +spexen <- applyQCScoreModel( + spe=spexen, + qcModel=cosmx_model, + scoreName="QC_score_cosmx_model" +) + + +## ----------------------------- +## Summaries +## ----------------------------- + +message("Xenium native score:") +print(summary(spexen$QC_score_xenium_native)) + +message("Xenium scored with CosMx model:") +print(summary(spexen$QC_score_cosmx_model)) + +message("CosMx native score:") +print(summary(specosm$QC_score_cosmx_native)) + +message("CosMx scored with Xenium model:") +print(summary(specosm$QC_score_xenium_model)) + + +## ----------------------------- +## Correlations +## ----------------------------- + +message("Correlation in Xenium: native vs CosMx model") +print( + cor( + spexen$QC_score_xenium_native, + spexen$QC_score_cosmx_model, + use="complete.obs" + ) +) + +message("Correlation in CosMx: native vs Xenium model") +print( + cor( + specosm$QC_score_cosmx_native, + specosm$QC_score_xenium_model, + use="complete.obs" + ) +) + + +## ----------------------------- +## Model details +## ----------------------------- + +message("Xenium model formula:") +print(metadata(spexen)$QCScore_model$model_formula) + +message("CosMx model formula:") +print(metadata(specosm)$QCScore_model$model_formula) + +message("Xenium model coefficients:") +print(metadata(spexen)$QCScore_model$coefficients_table) + +message("CosMx model coefficients:") +print(metadata(specosm)$QCScore_model$coefficients_table) From c0cf042355c3f3cc610199b34350af8742aea875 Mon Sep 17 00:00:00 2001 From: Dario Date: Wed, 20 May 2026 15:40:37 +0200 Subject: [PATCH 05/10] adding scripts for reviewing process --- R/plotUtils.R | 54 ++ inst/scripts/ablation_study.R | 484 ++++++++++++ inst/scripts/compare_qs_general.R | 79 ++ inst/scripts/create_cosmx_rds.R | 39 + inst/scripts/run_qs_general.R | 275 +++++++ inst/scripts/transfer_coeffs_cosmx_cosmx.R | 730 ++++++++++++++++++ ...oeffs.R => transfer_coeffs_cosmx_xenium.R} | 82 +- inst/scripts/transfer_coeffs_xenium_xenium.R | 348 +++++++++ 8 files changed, 2087 insertions(+), 4 deletions(-) create mode 100644 inst/scripts/ablation_study.R create mode 100644 inst/scripts/compare_qs_general.R create mode 100644 inst/scripts/create_cosmx_rds.R create mode 100644 inst/scripts/run_qs_general.R create mode 100644 inst/scripts/transfer_coeffs_cosmx_cosmx.R rename inst/scripts/{transfer_coeffs.R => transfer_coeffs_cosmx_xenium.R} (56%) create mode 100644 inst/scripts/transfer_coeffs_xenium_xenium.R diff --git a/R/plotUtils.R b/R/plotUtils.R index 21d6a8c..9fe56bd 100644 --- a/R/plotUtils.R +++ b/R/plotUtils.R @@ -200,3 +200,57 @@ firstFlagPalette <- c( "> logged aspect ratio higher thr." = "greenyellow" ) +# To be decided if this should be moved to a separate file or included in the main package +# plot_qc_score_comparison <- function(x, y, x_label="x", y_label="y", +# title="QC score comparison", threshold=0.75, +# cor_method="spearman", output_file=NULL) { + +# stopifnot(length(x) == length(y)) + +# df <- data.frame(x=x, y=y) + +# df$quadrant <- with(df, ifelse( +# x < threshold & y < threshold, "low / low", +# ifelse(x >= threshold & y < threshold, "high x / low y", +# ifelse(x < threshold & y >= threshold, "low x / high y", +# "high / high")) +# )) + +# cor_val <- cor(df$x, df$y, use="complete.obs", method=cor_method) + +# p <- ggplot(df, aes(x=x, y=y, color=quadrant)) + +# geom_point(alpha=0.35, size=0.6) + +# geom_abline(slope=1, intercept=0, linetype="dashed", color="red") + +# geom_hline(yintercept=threshold) + +# geom_vline(xintercept=threshold) + +# scale_color_manual(values=c( +# "low / low"="grey60", +# "high x / low y"="orange", +# "low x / high y"="dodgerblue", +# "high / high"="black" +# )) + +# labs(x=x_label, y=y_label, title=title, color="Quadrant") + +# theme_minimal() + +# annotate( +# "text", +# x=quantile(df$x, 0.99, na.rm=TRUE), +# y=quantile(df$y, 0.01, na.rm=TRUE), +# label=paste0(cor_method, " r = ", +# formatC(cor_val, digits=3, format="f")), +# hjust=1, +# vjust=0, +# color="black" +# ) + +# if (!is.null(output_file)) { +# ggsave(output_file, p, device="pdf", +# width=11.69, height=8.27, units="in") +# } + +# return(list( +# plot=p, +# correlation=cor_val, +# quadrant_table=table(df$quadrant) +# )) +# } + diff --git a/inst/scripts/ablation_study.R b/inst/scripts/ablation_study.R new file mode 100644 index 0000000..9cd3777 --- /dev/null +++ b/inst/scripts/ablation_study.R @@ -0,0 +1,484 @@ +## ============================================================ +## 0. Libraries and output directory +## ============================================================ + +library(SpaceTrooper) +library(ggplot2) + +out_dir <- "~/SpaceTrooper_qs_check" + +if (!dir.exists(out_dir)) { + dir.create(out_dir, recursive=TRUE) +} + + +## ============================================================ +## 1. Define ablation formulas +## ============================================================ +## The full model contains all Quality Score (QS) components. +## Each ablated model removes one component at a time. + +formulas <- list( + full="~(log2SignalDensity + Area_um + log2AspectRatio + log2Ctrl_total_ratio)^2", + + no_log2SignalDensity= + "~(Area_um + log2AspectRatio + log2Ctrl_total_ratio)^2", + + no_Area_um= + "~(log2SignalDensity + log2AspectRatio + log2Ctrl_total_ratio)^2", + + no_log2AspectRatio= + "~(log2SignalDensity + Area_um + log2Ctrl_total_ratio)^2", + + no_log2Ctrl_total_ratio= + "~(log2SignalDensity + Area_um + log2AspectRatio)^2" +) + + +## ============================================================ +## 2. Function to run QS ablation models +## ============================================================ +## For each formula, computeQCScore() is run with a forced model formula. +## The output stores: +## - the computed QS vector +## - the trained QS model stored in metadata(spe)$QCScore_model + +run_qs_ablation <- function(spe, formulas, verbose=FALSE) { + out <- list() + + for (nm in names(formulas)) { + message("Running model: ", nm) + + spe_i <- computeQCScore( + spe=spe, + modelFormula=formulas[[nm]], + verbose=verbose + ) + + out[[nm]] <- list( + spe=spe_i, + score=spe_i$QC_score, + model=metadata(spe_i)$QCScore_model + ) + } + + return(out) +} + + +## ============================================================ +## 3. Read and preprocess CosMx data +## ============================================================ +## The object is read, then per-cell QC metrics are computed. +## These metrics are required by computeQCScore(). + +cosm_path <- "~/Downloads/CosMx_data/DBKero/CosMx_Breast/CosMx_data_Case2/" + +specosm <- readCosmxSPE(cosm_path) + +specosm <- spatialPerCellQC(specosm) + + +## ============================================================ +## 4. Run ablation study on CosMx +## ============================================================ +## This computes the full QS and one QS for each ablated formula. + +set.seed(1998) + +abl_cosmx <- run_qs_ablation( + spe=specosm, + formulas=formulas, + verbose=TRUE +) + + +## ============================================================ +## 5. Functions to summarize ablation results +## ============================================================ +## bottom_overlap() compares the low-QS tails of two score vectors. +## By default q=0.10 means the bottom 10% of cells. +## +## summarize_qs_ablation() compares every ablated QS against the full QS +## using: +## - Pearson correlation +## - Spearman correlation +## - absolute score differences +## - overlap of bottom-tail low-quality cells + +bottom_overlap <- function(x, y, q=0.10) { + x_bad <- x <= quantile(x, probs=q, na.rm=TRUE) + y_bad <- y <= quantile(y, probs=q, na.rm=TRUE) + + overlap <- sum(x_bad & y_bad, na.rm=TRUE) + union <- sum(x_bad | y_bad, na.rm=TRUE) + + data.frame( + quantile=q, + overlap=overlap, + union=union, + jaccard=overlap / union, + prop_ref_recovered=overlap / sum(x_bad, na.rm=TRUE) + ) +} + + +summarize_qs_ablation <- function(ablation_results, reference="full", q=0.10) { + ref_score <- ablation_results[[reference]]$score + + res <- lapply(names(ablation_results), function(nm) { + score <- ablation_results[[nm]]$score + + ov <- bottom_overlap(ref_score, score, q=q) + + data.frame( + model=nm, + pearson=cor( + ref_score, + score, + use="complete.obs", + method="pearson" + ), + spearman=cor( + ref_score, + score, + use="complete.obs", + method="spearman" + ), + mean_abs_diff=mean(abs(ref_score - score), na.rm=TRUE), + median_abs_diff=median(abs(ref_score - score), na.rm=TRUE), + max_abs_diff=max(abs(ref_score - score), na.rm=TRUE), + bottom_jaccard=ov$jaccard, + bottom_ref_recovered=ov$prop_ref_recovered, + n_na=sum(is.na(score)), + stringsAsFactors=FALSE + ) + }) + + do.call(rbind, res) +} + + +## ============================================================ +## 6. Build ablation summary table +## ============================================================ + +tab_cosmx <- summarize_qs_ablation( + ablation_results=abl_cosmx, + reference="full", + q=0.10 +) + +tab_cosmx + + +## ============================================================ +## 7. Pairwise plots: full QS vs each ablated QS +## ============================================================ +## This assumes plot_qc_score_comparison() is already available. +## Each plot is also saved as an A4 landscape PDF. + +ablation_names <- setdiff(names(abl_cosmx), "full") + +plots_cosmx <- lapply(ablation_names, function(nm) { + plot_qc_score_comparison( + x=abl_cosmx$full$score, + y=abl_cosmx[[nm]]$score, + x_label="Full QS", + y_label=paste0("QS: ", nm), + title=paste0("CosMx: full QS vs ", nm), + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + paste0("cosmx_full_vs_", nm, "_A4_landscape.pdf") + ) + ) +}) + +names(plots_cosmx) <- ablation_names + +## Print plots +lapply(plots_cosmx, function(x) print(x$plot)) + + +## ============================================================ +## 8. Heatmap: complete ablation summary +## ============================================================ +## This heatmap includes correlations, score differences, and bottom-tail +## overlap metrics. + +heat_df <- tab_cosmx[tab_cosmx$model != "full", ] + +heat_df <- heat_df[, c( + "model", + "pearson", + "spearman", + "mean_abs_diff", + "median_abs_diff", + "bottom_jaccard", + "bottom_ref_recovered" +)] + +heat_long <- reshape( + heat_df, + varying=names(heat_df)[names(heat_df) != "model"], + v.names="value", + timevar="metric", + times=names(heat_df)[names(heat_df) != "model"], + direction="long" +) + +rownames(heat_long) <- NULL + +p_heat <- ggplot( + heat_long, + aes(x=metric, y=model, fill=value) +) + + geom_tile() + + geom_text( + aes(label=formatC(value, digits=3, format="f")), + color="white", + size=3 + ) + + labs( + title="CosMx QS ablation summary", + x="Metric", + y="Ablation model", + fill="Value" + ) + + theme_minimal() + + theme( + axis.text.x=element_text(angle=45, hjust=1) + ) + +p_heat + +ggsave( + filename=file.path( + out_dir, + "cosmx_ablation_summary_heatmap_A4_landscape.pdf" + ), + plot=p_heat, + device="pdf", + width=11.69, + height=8.27, + units="in" +) + + +## ============================================================ +## 9. Heatmap: correlation-only summary +## ============================================================ +## Cleaner figure showing only Pearson and Spearman correlations +## between the full QS and each ablated QS. + +cor_heat_df <- tab_cosmx[ + tab_cosmx$model != "full", + c("model", "pearson", "spearman") +] + +cor_heat_long <- reshape( + cor_heat_df, + varying=c("pearson", "spearman"), + v.names="correlation", + timevar="metric", + times=c("pearson", "spearman"), + direction="long" +) + +rownames(cor_heat_long) <- NULL + +p_cor_heat <- ggplot( + cor_heat_long, + aes(x=metric, y=model, fill=correlation) +) + + geom_tile() + + geom_text( + aes(label=formatC(correlation, digits=3, format="f")), + color="white", + size=4 + ) + + labs( + title="CosMx QS ablation correlation with full model", + x="Correlation metric", + y="Ablation model", + fill="Correlation" + ) + + theme_minimal() + +p_cor_heat + +ggsave( + filename=file.path( + out_dir, + "cosmx_ablation_correlation_heatmap.pdf" + ), + plot=p_cor_heat, + device="pdf", + width=8, + height=5, + units="in" +) + + +## ============================================================ +## 10. Optional: save tabular results +## ============================================================ + +write.csv( + tab_cosmx, + file=file.path(out_dir, "cosmx_ablation_summary_table.csv"), + row.names=FALSE +) + + +plot_ablation_colored <- function(ablation_results, ablation_name, + reference="full", color_col=NULL, + x_label="Full QS", y_label=NULL, title=NULL, + cor_method="spearman", output_file=NULL) { + + stopifnot(reference %in% names(ablation_results)) + stopifnot(ablation_name %in% names(ablation_results)) + + ref_score <- ablation_results[[reference]]$score + abl_score <- ablation_results[[ablation_name]]$score + + stopifnot(length(ref_score) == length(abl_score)) + + spe_ref <- ablation_results[[reference]]$spe + cd <- as.data.frame(colData(spe_ref)) + + df <- data.frame( + full_qs=ref_score, + ablated_qs=abl_score + ) + + if (!is.null(color_col)) { + stopifnot(color_col %in% colnames(cd)) + df$color_var <- cd[[color_col]] + } + + cor_val <- cor( + df$full_qs, + df$ablated_qs, + use="complete.obs", + method=cor_method + ) + + if (is.null(y_label)) { + y_label <- paste0("QS: ", ablation_name) + } + + if (is.null(title)) { + title <- paste0("Full QS vs ", ablation_name) + } + + if (is.null(color_col)) { + p <- ggplot(df, aes(x=full_qs, y=ablated_qs)) + + geom_point(alpha=0.35, size=0.6) + } else { + p <- ggplot(df, aes(x=full_qs, y=ablated_qs, color=color_var)) + + geom_point(alpha=0.35, size=0.6) + } + + p <- p + + geom_abline( + slope=1, + intercept=0, + linetype="dashed", + color="red" + ) + + labs( + x=x_label, + y=y_label, + title=title, + color=color_col + ) + + theme_minimal() + + annotate( + "text", + x=quantile(df$full_qs, 0.99, na.rm=TRUE), + y=quantile(df$ablated_qs, 0.01, na.rm=TRUE), + label=paste0( + cor_method, + " r = ", + formatC(cor_val, digits=3, format="f") + ), + hjust=1, + vjust=0, + color="black" + ) + + if (!is.null(output_file)) { + ggsave( + filename=output_file, + plot=p, + device="pdf", + width=11.69, + height=8.27, + units="in" + ) + } + + return(list( + plot=p, + correlation=cor_val, + data=df + )) +} + +p_no_signal_col <- plot_ablation_colored( + ablation_results=abl_cosmx, + ablation_name="no_log2SignalDensity", + color_col="log2Ctrl_total_ratio", + title="CosMx: full QS vs QS without log2SignalDensity", + y_label="QS without log2SignalDensity", + output_file="~/SpaceTrooper_qs_check/cosmx_full_vs_no_log2SignalDensity_log2Ctrl_total_ratiocolored.pdf" +) +p_no_signal_col_area <- plot_ablation_colored( + ablation_results=abl_cosmx, + ablation_name="no_log2SignalDensity", + color_col="Area_um", + title="CosMx: full QS vs QS without log2SignalDensity", + y_label="QS without log2SignalDensity", + output_file="~/SpaceTrooper_qs_check/cosmx_full_vs_no_log2SignalDensity_Area_um_colored.pdf" +) + +p_no_signal_col_aspect <- plot_ablation_colored( + ablation_results=abl_cosmx, + ablation_name="no_log2SignalDensity", + color_col="log2AspectRatio", + title="CosMx: full QS vs QS without log2SignalDensity", + y_label="QS without log2SignalDensity", + output_file="~/SpaceTrooper_qs_check/cosmx_full_vs_no_log2SignalDensity_log2AspectRatio_colored.pdf" +) + +p_no_signal_col_aspect$plot +p_no_signal_col$plot + +plot_ablation_colored( + ablation_results=abl_cosmx, + ablation_name="no_log2Ctrl_total_ratio", + color_col="log2SignalDensity", + title="CosMx: full QS vs QS without log2Ctrl_total_ratio", + y_label="QS without log2Ctrl_total_ratio", + output_file="~/SpaceTrooper_qs_check/cosmx_full_vs_no_log2ctrl_total_ratio_log2SignalDensitycolored.pdf" +)$plot + +plot_ablation_colored( + ablation_results=abl_cosmx, + ablation_name="no_log2Ctrl_total_ratio", + color_col="Area_um", + title="CosMx: full QS vs QS without log2Ctrl_total_ratio", + y_label="QS without log2Ctrl_total_ratio", + output_file="~/SpaceTrooper_qs_check/cosmx_full_vs_no_log2Ctrl_total_ratio_Area_um_colored.pdf" +)$plot +plot_ablation_colored( + ablation_results=abl_cosmx, + ablation_name="no_log2Ctrl_total_ratio", + color_col="log2AspectRatio", + title="CosMx: full QS vs QS without log2Ctrl_total_ratio", + y_label="QS without log2Ctrl_total_ratio", + output_file="~/SpaceTrooper_qs_check/cosmx_full_vs_no_log2Ctrl_total_ratio_log2AspectRatio_colored.pdf" +)$plot diff --git a/inst/scripts/compare_qs_general.R b/inst/scripts/compare_qs_general.R new file mode 100644 index 0000000..60dd790 --- /dev/null +++ b/inst/scripts/compare_qs_general.R @@ -0,0 +1,79 @@ +args <- commandArgs(trailingOnly=TRUE) + +if (length(args) != 2) { + stop("Usage: Rscript compare_qs_general.R before.rds current.rds") +} + +before <- readRDS(args[[1]]) +current <- readRDS(args[[2]]) + +qs_before <- before$quality_score +qs_current <- current$quality_score + +cat("BEFORE\n") +cat("Branch:", before$git_branch, "\n") +cat("Commit:", before$git_hash, "\n") +cat("QScore column:", before$qscore_column, "\n") +cat("Detection:", before$qscore_detection_method, "\n\n") + +cat("CURRENT\n") +cat("Branch:", current$git_branch, "\n") +cat("Commit:", current$git_hash, "\n") +cat("QScore column:", current$qscore_column, "\n") +cat("Detection:", current$qscore_detection_method, "\n\n") + +cat("Lengths:\n") +cat("before:", length(qs_before), "\n") +cat("current:", length(qs_current), "\n\n") + +if (length(qs_before) != length(qs_current)) { + stop("Different number of Quality Score values. Cannot compare directly.") +} + +comparison <- data.frame( + index=seq_along(qs_before), + quality_score_before=qs_before, + quality_score_current=qs_current, + difference=qs_current - qs_before +) + +cat("Summary before:\n") +print(summary(qs_before)) + +cat("\nSummary current:\n") +print(summary(qs_current)) + +cat("\nSummary difference current - before:\n") +print(summary(comparison$difference)) + +cat("\nall.equal:\n") +print(all.equal(qs_before, qs_current)) + +cat("\nExactly different values:\n") +print(sum(qs_before != qs_current, na.rm=TRUE)) + +cat("\nDifferent above tolerance 1e-8:\n") +print(sum(abs(qs_current - qs_before) > 1e-8, na.rm=TRUE)) + +cat("\nDifferent above tolerance 1e-6:\n") +print(sum(abs(qs_current - qs_before) > 1e-6, na.rm=TRUE)) + +cat("\nDifferent above tolerance 1e-4:\n") +print(sum(abs(qs_current - qs_before) > 1e-4, na.rm=TRUE)) + +out_csv <- file.path( + dirname(args[[2]]), + "quality_score_comparison.csv" +) + +out_rds <- file.path( + dirname(args[[2]]), + "quality_score_comparison.rds" +) + +write.csv(comparison, out_csv, row.names=FALSE) +saveRDS(comparison, out_rds) + +cat("\nSaved:\n") +cat(out_csv, "\n") +cat(out_rds, "\n") diff --git a/inst/scripts/create_cosmx_rds.R b/inst/scripts/create_cosmx_rds.R new file mode 100644 index 0000000..152d9f8 --- /dev/null +++ b/inst/scripts/create_cosmx_rds.R @@ -0,0 +1,39 @@ +suppressPackageStartupMessages({ + library(devtools) + library(SpatialExperiment) + library(SummarizedExperiment) +}) + + +input_dir <- "/Users/inzirio/Downloads/CosMx_data/DBKero/CosMx_Breast/CosMx_data_Case2" + +output_rds <- "~/SpaceTrooper_qs_check/cosmx_case2_spe_raw.rds" +output_rds <- path.expand(output_rds) + + +library(SpaceTrooper) + +if (!dir.exists(input_dir)) { + stop("Input directory does not exist: ", input_dir) +} + +message("Reading CosMx dataset from:") +message(input_dir) + +spe <- readCosmxSPE(input_dir) + +message("Object class:") +print(class(spe)) + +message("Object dimensions:") +print(dim(spe)) + +message("colData columns:") +print(colnames(SummarizedExperiment::colData(spe))) + +message("Saving RDS to:") +message(output_rds) + +saveRDS(spe, output_rds) + +message("Done.") diff --git a/inst/scripts/run_qs_general.R b/inst/scripts/run_qs_general.R new file mode 100644 index 0000000..607cf52 --- /dev/null +++ b/inst/scripts/run_qs_general.R @@ -0,0 +1,275 @@ + +package_path <- "/Users/inzirio/My Drive/works/coding/SpaceTrooper" +input_rds <- "~/SpaceTrooper_qs_check/cosmx_case2_spe_raw.rds" +output_rds <- "~/SpaceTrooper_qs_check/cosmx_case2_qs_result_interaction.rds" + + +message("Package path: ", package_path) +message("Input RDS: ", input_rds) +message("Output RDS: ", output_rds) + +library(devtools) +library(SpatialExperiment) + +devtools::load_all(package_path, quiet=FALSE) + + +obj <- readRDS(input_rds) +metadata(obj)$formula_variables <- c(log2CountArea="log2CountArea", + log2AspectRatio="log2AspectRatio") +obj <- spatialPerCellQC(obj) + +metadata(obj) +obj <- computeQCScore(obj) + +qc_nointeract <- obj$QC_score +qc_interact <- obj$QC_score +saveRDS(qc_nointeract, "~/SpaceTrooper_qs_check/qc_nointeract.rds") +saveRDS(qc_interact, "~/SpaceTrooper_qs_check/qc_interact.rds") + +library(ggplot2) + +# assume qc_nointeract and qc_interact exist in the environment +df <- data.frame(qc_nointeract = qc_nointeract, qc_interact = qc_interact) + +# correlation +cor_val <- cor(df$qc_nointeract, df$qc_interact, use = "complete.obs", method = "pearson") + +# plot: use 2D binning for large datasets, points otherwise; add y=x line and annotation +plot_qc <- ggplot(df, aes(x = qc_nointeract, y = qc_interact)) + + geom_point(alpha = 0.35, size = 0.6) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") + +# scale_fill_gradient(low = "white", high = "steelblue") + + labs( + x = "QC score (no interaction)", + y = "QC score (with interaction)", + title = "QC: no-interaction vs interaction" + ) + + theme_minimal() + + geom_hline(yintercept = 0.75) + + geom_vline(xintercept = 0.75) + + annotate( + "text", + x = quantile(df$qc_nointeract, 0.99, na.rm = TRUE), + y = quantile(df$qc_interact, 0.01, na.rm = TRUE), + label = paste0("r = ", formatC(cor_val, digits = 3, format = "f")), + hjust = 1, vjust = 0 + ) + +# return the plot object +plot_qc + + +library(ggplot2) + +df <- data.frame( + qc_nointeract=qc_nointeract, + qc_interact=qc_interact +) + +threshold <- 0.75 + +df$quadrant <- with( + df, + ifelse( + qc_nointeract < threshold & qc_interact < threshold, + "low / low", + ifelse( + qc_nointeract >= threshold & qc_interact < threshold, + "high no-interaction / low interaction", + ifelse( + qc_nointeract < threshold & qc_interact >= threshold, + "low no-interaction / high interaction", + "high / high" + ) + ) + ) +) + +table(df$quadrant) + +cor_val <- cor( + df$qc_nointeract, + df$qc_interact, + use="complete.obs", + method="pearson" +) + +plot_qc <- ggplot(df, aes(x=qc_nointeract, y=qc_interact, color=quadrant)) + + geom_point(alpha=0.35, size=0.6) + + geom_abline( + slope=1, + intercept=0, + linetype="dashed", + color="red" + ) + + geom_hline(yintercept=threshold) + + geom_vline(xintercept=threshold) + + labs( + x="QC score (no interaction)", + y="QC score (with interaction)", + title="QC: no-interaction vs interaction", + color="Quadrant" + ) + + theme_minimal() + + annotate( + "text", + x=quantile(df$qc_nointeract, 0.99, na.rm=TRUE), + y=quantile(df$qc_interact, 0.01, na.rm=TRUE), + label=paste0("r = ", formatC(cor_val, digits=3, format="f")), + hjust=1, + vjust=0, + color="black" + ) + +plot_qc + +plot_qc <- plot_qc + + scale_color_manual( + values=c( + "low / low"="grey60", + "high no-interaction / low interaction"="orange", + "low no-interaction / high interaction"="dodgerblue", + "high / high"="black" + ) + ) + +ggsave( + filename="~/SpaceTrooper_qs_check/qc_nointeraction_vs_interaction_A4_landscape.pdf", + plot=plot_qc, + device="pdf", + width=11.69, + height=8.27, + units="in" +) +# get_coldata_df <- function(x) { +# cd <- SummarizedExperiment::colData(x) +# as.data.frame(cd) +# } + +# get_qscore_from_coldata <- function(x) { +# cd <- get_coldata_df(x) + +# candidate_names <- c( +# "quality_score", +# "QualityScore", +# "qualityScore", +# "qscore", +# "QScore", +# "Q_score", +# "q_score", +# "score", +# "flag_score", +# "qc_score", +# "QC_score", +# "QCScore", +# "cell_quality_score", +# "spatial_quality_score" +# ) + +# exact_hits <- intersect(candidate_names, colnames(cd)) + +# if (length(exact_hits) > 0) { +# selected <- exact_hits[1] +# return(list( +# values=cd[[selected]], +# column=selected, +# method="exact_name_match", +# all_candidates=exact_hits, +# coldata=cd +# )) +# } + +# pattern_hits <- grep( +# "quality|qscore|q_score|qc_score|flag_score|score", +# colnames(cd), +# ignore.case=TRUE, +# value=TRUE +# ) + +# numeric_pattern_hits <- pattern_hits[ +# vapply(cd[pattern_hits], is.numeric, logical(1)) +# ] + +# if (length(numeric_pattern_hits) > 0) { +# selected <- numeric_pattern_hits[1] +# return(list( +# values=cd[[selected]], +# column=selected, +# method="pattern_numeric_match", +# all_candidates=numeric_pattern_hits, +# coldata=cd +# )) +# } + +# stop( +# "Could not automatically identify Quality Score column.\n", +# "Available colData columns are:\n", +# paste(colnames(cd), collapse=", ") +# ) +# } + +# call_function_if_exists <- function(fun_name, x) { +# if (!exists(fun_name, mode="function")) { +# stop("Function not found in loaded SpaceTrooper branch: ", fun_name) +# } + +# fun <- get(fun_name, mode="function") +# fun(x) +# } + +# obj_before <- obj + +# if (run_spatial_qc) { +# message("Running spatial QC function...") +# obj <- call_function_if_exists(spatial_qc_fun, obj) +# } + +# message("Running Quality Score function...") +# obj_after <- call_function_if_exists(qscore_fun, obj) + +# message("Extracting Quality Score...") +# qs_info <- get_qscore_from_coldata(obj_after) + +# qs <- qs_info$values + +# if (!is.numeric(qs)) { +# warning("Detected Quality Score column is not numeric: ", qs_info$column) +# } + +# out <- list( +# git_branch=git_branch, +# git_hash=git_hash, +# package_path=normalizePath(package_path), +# input_rds=normalizePath(input_rds), +# spatial_qc_fun=spatial_qc_fun, +# qscore_fun=qscore_fun, +# run_spatial_qc=run_spatial_qc, +# qscore_column=qs_info$column, +# qscore_detection_method=qs_info$method, +# qscore_candidates=qs_info$all_candidates, +# n=length(qs), +# object_class=class(obj_after), +# object_dim=tryCatch(dim(obj_after), error=function(e) NULL), +# object_colnames=tryCatch(colnames(obj_after), error=function(e) NULL), +# quality_score=qs, +# quality_score_summary=summary(qs), +# colData=qs_info$coldata, +# session_info=sessionInfo() +# ) + +# saveRDS(out, output_rds) + +# message("Saved result to: ", output_rds) +# message("Detected Quality Score column: ", qs_info$column) +# message("Detection method: ", qs_info$method) +# message("Number of values: ", length(qs)) +# print(summary(qs)) + + +# computeQCScore(obj) + +# # model_formula <- paste0("~ log2CountArea + I(abs(log2AspectRatio) ", +# # "* as.numeric(dist_border<50)) + ", +# # " log2CountArea:I(abs(log2AspectRatio)", +# # "* as.numeric(dist_border<50))") #for cosmx diff --git a/inst/scripts/transfer_coeffs_cosmx_cosmx.R b/inst/scripts/transfer_coeffs_cosmx_cosmx.R new file mode 100644 index 0000000..d6d0aea --- /dev/null +++ b/inst/scripts/transfer_coeffs_cosmx_cosmx.R @@ -0,0 +1,730 @@ +## ============================================================ +## 0. Libraries and output directory +## ============================================================ + +devtools::load_all() +library(ggplot2) + +out_dir <- "~/SpaceTrooper_qs_check" + +set.seed(1998) + + +## ============================================================ +## 1. Define common CosMx model formula +## ============================================================ +## Since both datasets are CosMx, we can use the CosMx-specific formula, +## including the border-adjusted aspect ratio term and control ratio. + +cosmx_formula <- paste0( + "~(", + "log2SignalDensity + ", + "Area_um + ", + "I(abs(log2AspectRatio) * as.numeric(dist_border < 50)) + ", + "log2Ctrl_total_ratio", + ")^2" +) + + +## ============================================================ +## 2. Read and preprocess CosMx Breast dataset +## ============================================================ + +cosmx_breast_path <- "~/Downloads/CosMx_data/DBKero/CosMx_Breast/CosMx_data_Case2/" + +specosm_breast <- readCosmxSPE(cosmx_breast_path) + +specosm_breast <- spatialPerCellQC(specosm_breast) + + +## ============================================================ +## 3. Read and preprocess CosMx Pancreas dataset +## ============================================================ + +cosmx_pancreas_path <- "/Users/inzirio/Downloads/CosMx_data/Pancreas-CosMx-WTx" + +specosm_pancreas <- readCosmxSPE(cosmx_pancreas_path) + +specosm_pancreas <- spatialPerCellQC(specosm_pancreas) + + +## ============================================================ +## 4. Train native QS models on both datasets +## ============================================================ + +specosm_breast <- computeQCScore( + spe=specosm_breast, + modelFormula=cosmx_formula, + verbose=TRUE +) + +specosm_pancreas <- computeQCScore( + spe=specosm_pancreas, + modelFormula=cosmx_formula, + verbose=TRUE +) + + +## ============================================================ +## 5. Store native QS scores and trained models +## ============================================================ + +specosm_breast$QC_score_breast_native <- specosm_breast$QC_score +specosm_pancreas$QC_score_pancreas_native <- specosm_pancreas$QC_score + +breast_model <- metadata(specosm_breast)$QCScore_model +pancreas_model <- metadata(specosm_pancreas)$QCScore_model + + +## ============================================================ +## 6. Apply transferred models +## ============================================================ +## Apply Pancreas-trained model to Breast. +## Apply Breast-trained model to Pancreas. + +specosm_breast <- applyQCScoreModel( + spe=specosm_breast, + qcModel=pancreas_model, + scoreName="QC_score_pancreas_model" +) + +specosm_pancreas <- applyQCScoreModel( + spe=specosm_pancreas, + qcModel=breast_model, + scoreName="QC_score_breast_model" +) + + +## ============================================================ +## 7. Basic checks +## ============================================================ + +summary(specosm_breast$QC_score_breast_native) +summary(specosm_breast$QC_score_pancreas_model) + +summary(specosm_pancreas$QC_score_pancreas_native) +summary(specosm_pancreas$QC_score_breast_model) + +range(specosm_breast$QC_score_breast_native, na.rm=TRUE) +range(specosm_breast$QC_score_pancreas_model, na.rm=TRUE) + +range(specosm_pancreas$QC_score_pancreas_native, na.rm=TRUE) +range(specosm_pancreas$QC_score_breast_model, na.rm=TRUE) + +sum(is.na(specosm_breast$QC_score_breast_native)) +sum(is.na(specosm_breast$QC_score_pancreas_model)) + +sum(is.na(specosm_pancreas$QC_score_pancreas_native)) +sum(is.na(specosm_pancreas$QC_score_breast_model)) + + +## ============================================================ +## 8. Correlations: native vs transferred +## ============================================================ + +cor_breast_spearman <- cor( + specosm_breast$QC_score_breast_native, + specosm_breast$QC_score_pancreas_model, + use="complete.obs", + method="spearman" +) + +cor_breast_pearson <- cor( + specosm_breast$QC_score_breast_native, + specosm_breast$QC_score_pancreas_model, + use="complete.obs", + method="pearson" +) + +cor_pancreas_spearman <- cor( + specosm_pancreas$QC_score_pancreas_native, + specosm_pancreas$QC_score_breast_model, + use="complete.obs", + method="spearman" +) + +cor_pancreas_pearson <- cor( + specosm_pancreas$QC_score_pancreas_native, + specosm_pancreas$QC_score_breast_model, + use="complete.obs", + method="pearson" +) + +cor_breast_spearman +cor_breast_pearson + +cor_pancreas_spearman +cor_pancreas_pearson + + +## ============================================================ +## 9. Plot: Breast native QS vs Pancreas-trained QS +## ============================================================ + +res_breast <- plot_qc_score_comparison( + x=specosm_breast$QC_score_breast_native, + y=specosm_breast$QC_score_pancreas_model, + x_label="Breast native QS", + y_label="Pancreas-trained QS", + title="CosMx Breast: native QS vs Pancreas-trained QS", + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + "cosmx_breast_native_vs_pancreas_model_A4_landscape.pdf" + ) +) + +res_breast$plot +res_breast$correlation +res_breast$quadrant_table + + +## ============================================================ +## 10. Plot: Pancreas native QS vs Breast-trained QS +## ============================================================ + +res_pancreas <- plot_qc_score_comparison( + x=specosm_pancreas$QC_score_pancreas_native, + y=specosm_pancreas$QC_score_breast_model, + x_label="Pancreas native QS", + y_label="Breast-trained QS", + title="CosMx Pancreas: native QS vs Breast-trained QS", + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + "cosmx_pancreas_native_vs_breast_model_A4_landscape.pdf" + ) +) + +res_pancreas$plot +res_pancreas$correlation +res_pancreas$quadrant_table + + +## ============================================================ +## 11. Compare model formulas and coefficients +## ============================================================ + +metadata(specosm_breast)$QCScore_model$model_formula +metadata(specosm_pancreas)$QCScore_model$model_formula + +metadata(specosm_breast)$QCScore_model$model_matrix_colnames +metadata(specosm_pancreas)$QCScore_model$model_matrix_colnames + +metadata(specosm_breast)$QCScore_model$coefficients_table +metadata(specosm_pancreas)$QCScore_model$coefficients_table + + +## ============================================================ +## 12. Save summary table +## ============================================================ + +cosmx_transfer_summary <- data.frame( + comparison=c( + "Breast native vs Pancreas-trained", + "Pancreas native vs Breast-trained" + ), + spearman=c( + cor_breast_spearman, + cor_pancreas_spearman + ), + pearson=c( + cor_breast_pearson, + cor_pancreas_pearson + ), + stringsAsFactors=FALSE +) + +cosmx_transfer_summary + +write.csv( + cosmx_transfer_summary, + file=file.path(out_dir, "cosmx_breast_pancreas_transfer_summary.csv"), + row.names=FALSE +) + +plot_qc_score_coldata <- function(x, y, spe, color_col, + x_label="x", y_label="y", + title="QC score comparison", + threshold=0.75, + cor_method="spearman", + output_file=NULL) { + + stopifnot(length(x) == length(y)) + stopifnot(ncol(spe) == length(x)) + stopifnot(color_col %in% colnames(colData(spe))) + + df <- data.frame( + x=x, + y=y, + color_var=as.data.frame(colData(spe))[[color_col]] + ) + + cor_val <- cor( + df$x, + df$y, + use="complete.obs", + method=cor_method + ) + + p <- ggplot(df, aes(x=x, y=y, color=color_var)) + + geom_point(alpha=0.35, size=0.6) + + geom_abline( + slope=1, + intercept=0, + linetype="dashed", + color="red" + ) + + geom_hline(yintercept=threshold) + + geom_vline(xintercept=threshold) + + labs( + x=x_label, + y=y_label, + title=title, + color=color_col + ) + + theme_minimal() + + annotate( + "text", + x=quantile(df$x, 0.99, na.rm=TRUE), + y=quantile(df$y, 0.01, na.rm=TRUE), + label=paste0( + cor_method, + " r = ", + formatC(cor_val, digits=3, format="f") + ), + hjust=1, + vjust=0, + color="black" + ) + + if (!is.null(output_file)) { + ggsave( + filename=output_file, + plot=p, + device="pdf", + width=11.69, + height=8.27, + units="in" + ) + } + + return(list( + plot=p, + correlation=cor_val, + data=df + )) +} + + +res_breast_signal <- plot_qc_score_coldata( + x=specosm_breast$QC_score_breast_native, + y=specosm_breast$QC_score_pancreas_model, + spe=specosm_breast, + color_col="log2SignalDensity", + x_label="Breast native QS", + y_label="Pancreas-trained QS", + title="CosMx Breast: native QS vs Pancreas-trained QS", + threshold=0.75, + cor_method="spearman", + output_file="~/SpaceTrooper_qs_check/cosmx_breast_native_vs_pancreas_model_colored_by_log2SignalDensity.pdf" +) + +res_breast_signal$plot + +res_breast_area <- plot_qc_score_coldata( + x=specosm_breast$QC_score_breast_native, + y=specosm_breast$QC_score_pancreas_model, + spe=specosm_breast, + color_col="Area_um", + x_label="Breast native QS", + y_label="Pancreas-trained QS", + title="CosMx Breast: native QS vs Pancreas-trained QS", + threshold=0.75, + cor_method="spearman", + output_file="~/SpaceTrooper_qs_check/cosmx_breast_native_vs_pancreas_model_colored_by_Area_um.pdf" +) + +res_breast_area$plot + +res_breast_ctrl <- plot_qc_score_coldata( + x=specosm_breast$QC_score_breast_native, + y=specosm_breast$QC_score_pancreas_model, + spe=specosm_breast, + color_col="log2Ctrl_total_ratio", + x_label="Breast native QS", + y_label="Pancreas-trained QS", + title="CosMx Breast: native QS vs Pancreas-trained QS", + threshold=0.75, + cor_method="spearman", + output_file="~/SpaceTrooper_qs_check/cosmx_breast_native_vs_pancreas_model_colored_by_log2Ctrl_total_ratio.pdf" +) + +res_breast_ctrl$plot + +res_breast_aspect <- plot_qc_score_coldata( + x=specosm_breast$QC_score_breast_native, + y=specosm_breast$QC_score_pancreas_model, + spe=specosm_breast, + color_col="log2AspectRatio", + x_label="Breast native QS", + y_label="Pancreas-trained QS", + title="CosMx Breast: native QS vs Pancreas-trained QS", + threshold=0.75, + cor_method="spearman", + output_file="~/SpaceTrooper_qs_check/cosmx_breast_native_vs_pancreas_model_colored_by_log2AspectRatio.pdf" +) + +res_breast_aspect$plot + +SpaceTrooper::plotMetricHist(spe = specosm_breast, metric="log2Ctrl_total_ratio") +SpaceTrooper::plotMetricHist(spe = specosm_pancreas, metric="log2Ctrl_total_ratio") + +############################################################ +specosm_mb1 <- readCosmxSPE("/Users/inzirio/Downloads/CosMx_data/CosMx1k_MouseBrain1") +specosm_mb2 <- readCosmxSPE("/Users/inzirio/Downloads/CosMx_data/CosMx1k_MouseBrain2") + + +specosm_mb1 <- spatialPerCellQC(specosm_mb1) +specosm_mb2 <- spatialPerCellQC(specosm_mb2) + +cosmx_formula <- paste0( + "~(", + "log2SignalDensity + ", + "Area_um + ", + "I(abs(log2AspectRatio) * as.numeric(dist_border < 50)) + ", + "log2Ctrl_total_ratio", + ")^2" +) + +specosm_mb1 <- computeQCScore( + spe=specosm_mb1, + modelFormula=cosmx_formula, + verbose=TRUE +) + +specosm_mb2 <- computeQCScore( + spe=specosm_mb2, + modelFormula=cosmx_formula, + verbose=TRUE +) + +specosm_mb1$QC_score_mb1_native <- specosm_mb1$QC_score +specosm_mb2$QC_score_mb2_native <- specosm_mb2$QC_score + +mb1_model <- metadata(specosm_mb1)$QCScore_model +mb2_model <- metadata(specosm_mb2)$QCScore_model + +specosm_mb1 <- applyQCScoreModel( + spe=specosm_mb1, + qcModel=mb2_model, + scoreName="QC_score_mb2_model" +) + +specosm_mb2 <- applyQCScoreModel( + spe=specosm_mb2, + qcModel=mb1_model, + scoreName="QC_score_mb1_model" +) + +cor_mb1_spearman <- cor( + specosm_mb1$QC_score_mb1_native, + specosm_mb1$QC_score_mb2_model, + use="complete.obs", + method="spearman" +) + +cor_mb1_pearson <- cor( + specosm_mb1$QC_score_mb1_native, + specosm_mb1$QC_score_mb2_model, + use="complete.obs", + method="pearson" +) + +cor_mb2_spearman <- cor( + specosm_mb2$QC_score_mb2_native, + specosm_mb2$QC_score_mb1_model, + use="complete.obs", + method="spearman" +) + +cor_mb2_pearson <- cor( + specosm_mb2$QC_score_mb2_native, + specosm_mb2$QC_score_mb1_model, + use="complete.obs", + method="pearson" +) + +res_mb1 <- plot_qc_score_comparison( + x=specosm_mb1$QC_score_mb1_native, + y=specosm_mb1$QC_score_mb2_model, + x_label="MouseBrain1 native QS", + y_label="MouseBrain2-trained QS", + title="CosMx MouseBrain1: native QS vs MouseBrain2-trained QS", + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + "cosmx_mousebrain1_native_vs_mousebrain2_model_A4_landscape.pdf" + ) +) + +res_mb2 <- plot_qc_score_comparison( + x=specosm_mb2$QC_score_mb2_native, + y=specosm_mb2$QC_score_mb1_model, + x_label="MouseBrain2 native QS", + y_label="MouseBrain1-trained QS", + title="CosMx MouseBrain2: native QS vs MouseBrain1-trained QS", + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + "cosmx_mousebrain2_native_vs_mousebrain1_model_A4_landscape.pdf" + ) +) + +res_mb1$plot +res_mb2$plot + +color_vars <- c( + "log2SignalDensity", + "Area_um", + "log2AspectRatio", + "log2Ctrl_total_ratio" +) + +mb1_colored_plots <- lapply(color_vars, function(v) { + plot_qc_score_coldata( + x=specosm_mb1$QC_score_mb1_native, + y=specosm_mb1$QC_score_mb2_model, + spe=specosm_mb1, + color_col=v, + x_label="MouseBrain1 native QS", + y_label="MouseBrain2-trained QS", + title=paste0("MouseBrain1 native vs MouseBrain2-trained QS: ", v), + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + paste0("cosmx_mousebrain1_native_vs_mousebrain2_model_", v, ".pdf") + ) + ) +}) +names(mb1_colored_plots) <- color_vars + +mb2_colored_plots <- lapply(color_vars, function(v) { + plot_qc_score_coldata( + x=specosm_mb2$QC_score_mb2_native, + y=specosm_mb2$QC_score_mb1_model, + spe=specosm_mb2, + color_col=v, + x_label="MouseBrain2 native QS", + y_label="MouseBrain1-trained QS", + title=paste0("MouseBrain2 native vs MouseBrain1-trained QS: ", v), + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + paste0("cosmx_mousebrain2_native_vs_mousebrain1_model_", v, ".pdf") + ) + ) +}) +names(mb2_colored_plots) <- color_vars +mb2_colored_plots +cosmx_mousebrain_transfer_summary <- data.frame( + comparison=c( + "MouseBrain1 native vs MouseBrain2-trained", + "MouseBrain2 native vs MouseBrain1-trained" + ), + spearman=c(cor_mb1_spearman, cor_mb2_spearman), + pearson=c(cor_mb1_pearson, cor_mb2_pearson), + stringsAsFactors=FALSE +) + +cosmx_mousebrain_transfer_summary + +write.csv( + cosmx_mousebrain_transfer_summary, + file=file.path(out_dir, "cosmx_mousebrain1_mousebrain2_transfer_summary.csv"), + row.names=FALSE +) + +metadata(specosm_mb1)$QCScore_model$model_matrix_colnames +metadata(specosm_mb2)$QCScore_model$model_matrix_colnames + +#################### + +spexen_bc1 <- readXeniumSPE("/Users/inzirio/Downloads/Xenium_data/Xenium_HumanBreast1_Janesick/") +spexen_bc2 <- readXeniumSPE("/Users/inzirio/Downloads/Xenium_data/Xenium_HumanBreast2_Janesick") + + +spexen_bc1 <- spatialPerCellQC(spexen_bc1) +spexen_bc2 <- spatialPerCellQC(spexen_bc2) + +xenium_formula <- paste0( + "~(", + "log2SignalDensity + ", + "Area_um + ", + "log2AspectRatio + ", + "log2Ctrl_total_ratio", + ")^2" +) + +spexen_bc1 <- computeQCScore( + spe=spexen_bc1, + modelFormula=xenium_formula, + verbose=TRUE +) + +spexen_bc2 <- computeQCScore( + spe=spexen_bc2, + modelFormula=xenium_formula, + verbose=TRUE +) + +spexen_bc1$QC_score_bc1_native <- spexen_bc1$QC_score +spexen_bc2$QC_score_bc2_native <- spexen_bc2$QC_score + +bc1_model <- metadata(spexen_bc1)$QCScore_model +bc2_model <- metadata(spexen_bc2)$QCScore_model + +spexen_bc1 <- applyQCScoreModel( + spe=spexen_bc1, + qcModel=bc2_model, + scoreName="QC_score_bc2_model" +) + +spexen_bc2 <- applyQCScoreModel( + spe=spexen_bc2, + qcModel=bc1_model, + scoreName="QC_score_bc1_model" +) + +cor_bc1_spearman <- cor( + spexen_bc1$QC_score_bc1_native, + spexen_bc1$QC_score_bc2_model, + use="complete.obs", + method="spearman" +) + +cor_bc1_pearson <- cor( + spexen_bc1$QC_score_bc1_native, + spexen_bc1$QC_score_bc2_model, + use="complete.obs", + method="pearson" +) + +cor_bc2_spearman <- cor( + spexen_bc2$QC_score_bc2_native, + spexen_bc2$QC_score_bc1_model, + use="complete.obs", + method="spearman" +) + +cor_bc2_pearson <- cor( + spexen_bc2$QC_score_bc2_native, + spexen_bc2$QC_score_bc1_model, + use="complete.obs", + method="pearson" +) + +res_bc1 <- plot_qc_score_comparison( + x=spexen_bc1$QC_score_bc1_native, + y=spexen_bc1$QC_score_bc2_model, + x_label="Breast Cancer 1 native QS", + y_label="Breast Cancer 2-trained QS", + title="Xenium Breast Cancer 1: native QS vs Breast Cancer 2-trained QS", + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + "xenium_breast1_native_vs_breast2_model_A4_landscape.pdf" + ) +) + +res_bc2 <- plot_qc_score_comparison( + x=spexen_bc2$QC_score_bc2_native, + y=spexen_bc2$QC_score_bc1_model, + x_label="Breast Cancer 2 native QS", + y_label="Breast Cancer 1-trained QS", + title="Xenium Breast Cancer 2: native QS vs Breast Cancer 1-trained QS", + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + "xenium_breast2_native_vs_breast1_model_A4_landscape.pdf" + ) +) + +res_bc1$plot +res_bc2$plot + +color_vars <- c( + "log2SignalDensity", + "Area_um", + "log2AspectRatio", + "log2Ctrl_total_ratio" +) + +bc1_colored_plots <- lapply(color_vars, function(v) { + plot_qc_score_coldata( + x=spexen_bc1$QC_score_bc1_native, + y=spexen_bc1$QC_score_bc2_model, + spe=spexen_bc1, + color_col=v, + x_label="Breast Cancer 1 native QS", + y_label="Breast Cancer 2-trained QS", + title=paste0("Breast Cancer 1 native vs Breast Cancer 2-trained QS: ", v), + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + paste0("xenium_breast1_native_vs_breast2_model_", v, ".pdf") + ) + ) +}) +names(bc1_colored_plots) <- color_vars + +bc2_colored_plots <- lapply(color_vars, function(v) { + plot_qc_score_coldata( + x=spexen_bc2$QC_score_bc2_native, + y=spexen_bc2$QC_score_bc1_model, + spe=spexen_bc2, + color_col=v, + x_label="Breast Cancer 2 native QS", + y_label="Breast Cancer 1-trained QS", + title=paste0("Breast Cancer 2 native vs Breast Cancer 1-trained QS: ", v), + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + paste0("xenium_breast2_native_vs_breast1_model_", v, ".pdf") + ) + ) +}) +names(bc2_colored_plots) <- color_vars + +xenium_breast_transfer_summary <- data.frame( + comparison=c( + "Breast Cancer 1 native vs Breast Cancer 2-trained", + "Breast Cancer 2 native vs Breast Cancer 1-trained" + ), + spearman=c(cor_bc1_spearman, cor_bc2_spearman), + pearson=c(cor_bc1_pearson, cor_bc2_pearson), + stringsAsFactors=FALSE +) + +xenium_breast_transfer_summary + +write.csv( + xenium_breast_transfer_summary, + file=file.path(out_dir, "xenium_breast1_breast2_transfer_summary.csv"), + row.names=FALSE +) + +metadata(spexen_bc1)$QCScore_model$model_matrix_colnames +metadata(spexen_bc2)$QCScore_model$model_matrix_colnames + +######################## diff --git a/inst/scripts/transfer_coeffs.R b/inst/scripts/transfer_coeffs_cosmx_xenium.R similarity index 56% rename from inst/scripts/transfer_coeffs.R rename to inst/scripts/transfer_coeffs_cosmx_xenium.R index 19812b4..ae260f5 100644 --- a/inst/scripts/transfer_coeffs.R +++ b/inst/scripts/transfer_coeffs_cosmx_xenium.R @@ -1,6 +1,6 @@ library(SpaceTrooper) -library(SummarizedExperiment) -library(S4Vectors) +# library(SummarizedExperiment) +# library(S4Vectors) set.seed(1998) @@ -22,7 +22,19 @@ spexen <- computeQCScore( verbose=TRUE ) -xen_model <- metadata(spexen)$QCScore_model +sum(is.na(spexen$QC_score)) +colSums(is.na(as.data.frame(colData(spexen))[ + , c("log2SignalDensity", "Area_um", "log2AspectRatio") +])) + +length(spexen$QC_score) == ncol(spexen) + +sum(is.na(spexen$QC_score)) +summary(spexen$QC_score) + +metadata(spexen)$QCScore_model$model_formula +metadata(spexen)$QCScore_model$model_matrix_colnames +metadata(spexen)$QCScore_model$coefficients_table ## ----------------------------- @@ -41,8 +53,14 @@ specosm <- computeQCScore( verbose=TRUE ) -cosmx_model <- metadata(specosm)$QCScore_model +length(specosm$QC_score) == ncol(specosm) +sum(is.na(specosm$QC_score)) +summary(specosm$QC_score) + +metadata(specosm)$QCScore_model$model_formula +metadata(specosm)$QCScore_model$model_matrix_colnames +metadata(specosm)$QCScore_model$coefficients_table ## ----------------------------- ## Save native scores before transfer @@ -51,6 +69,8 @@ cosmx_model <- metadata(specosm)$QCScore_model spexen$QC_score_xenium_native <- spexen$QC_score specosm$QC_score_cosmx_native <- specosm$QC_score +xen_model <- metadata(spexen)$QCScore_model +cosmx_model <- metadata(specosm)$QCScore_model ## ----------------------------- ## Apply transferred models @@ -124,3 +144,57 @@ print(metadata(spexen)$QCScore_model$coefficients_table) message("CosMx model coefficients:") print(metadata(specosm)$QCScore_model$coefficients_table) + + +## ----------------------------- +## Correlations of scores with model variables +## ----------------------------- +cor( + spexen$QC_score_xenium_native, + spexen$QC_score_cosmx_model, + use="complete.obs", + method="spearman" +) + +cor( + specosm$QC_score_cosmx_native, + specosm$QC_score_xenium_model, + use="complete.obs", + method="spearman" +) + +range(spexen$QC_score_cosmx_model, na.rm=TRUE) +sum(is.na(spexen$QC_score_cosmx_model)) + +range(specosm$QC_score_xenium_model, na.rm=TRUE) +sum(is.na(specosm$QC_score_xenium_model)) + +res_xen <- plot_qc_score_comparison( + x=spexen$QC_score_xenium_native, + y=spexen$QC_score_cosmx_model, + x_label="Native Xenium QC score", + y_label="CosMx-trained QC score", + title="Xenium: native vs CosMx-trained QC score", + cor_method = "pearson", + threshold=0.25, + output_file="~/SpaceTrooper_qs_check/xenium_native_vs_cosmx_model.pdf" +) + +res_xen$plot +res_xen$correlation +res_xen$quadrant_table + +res_cosmx <- plot_qc_score_comparison( + x=specosm$QC_score_cosmx_native, + y=specosm$QC_score_xenium_model, + x_label="Native CosMx QC score", + y_label="Xenium-trained QC score", + title="CosMx: native vs Xenium-trained QC score", + cor_method = "pearson", + threshold=0.75, + output_file="~/SpaceTrooper_qs_check/cosmx_native_vs_xenium_model.pdf" +) + +res_cosmx$plot +res_cosmx$correlation +res_cosmx$quadrant_table diff --git a/inst/scripts/transfer_coeffs_xenium_xenium.R b/inst/scripts/transfer_coeffs_xenium_xenium.R new file mode 100644 index 0000000..7ed919e --- /dev/null +++ b/inst/scripts/transfer_coeffs_xenium_xenium.R @@ -0,0 +1,348 @@ +spexen_bc1 <- readXeniumSPE("/Users/inzirio/Downloads/Xenium_data/Xenium_HumanBreast1_Janesick/") +spexen_bc2 <- readXeniumSPE("/Users/inzirio/Downloads/Xenium_data/Xenium_HumanBreast2_Janesick") + + +spexen_bc1 <- spatialPerCellQC(spexen_bc1) +spexen_bc2 <- spatialPerCellQC(spexen_bc2) + +xenium_formula <- paste0( + "~(", + "log2SignalDensity + ", + "Area_um + ", + "log2AspectRatio + ", + "log2Ctrl_total_ratio", + ")^2" +) + +spexen_bc1 <- computeQCScore( + spe=spexen_bc1, + modelFormula=xenium_formula, + verbose=TRUE +) + +spexen_bc2 <- computeQCScore( + spe=spexen_bc2, + modelFormula=xenium_formula, + verbose=TRUE +) + +spexen_bc1$QC_score_bc1_native <- spexen_bc1$QC_score +spexen_bc2$QC_score_bc2_native <- spexen_bc2$QC_score + +bc1_model <- metadata(spexen_bc1)$QCScore_model +bc2_model <- metadata(spexen_bc2)$QCScore_model + +spexen_bc1 <- applyQCScoreModel( + spe=spexen_bc1, + qcModel=bc2_model, + scoreName="QC_score_bc2_model" +) + +spexen_bc2 <- applyQCScoreModel( + spe=spexen_bc2, + qcModel=bc1_model, + scoreName="QC_score_bc1_model" +) + +cor_bc1_spearman <- cor( + spexen_bc1$QC_score_bc1_native, + spexen_bc1$QC_score_bc2_model, + use="complete.obs", + method="spearman" +) + +cor_bc1_pearson <- cor( + spexen_bc1$QC_score_bc1_native, + spexen_bc1$QC_score_bc2_model, + use="complete.obs", + method="pearson" +) + +cor_bc2_spearman <- cor( + spexen_bc2$QC_score_bc2_native, + spexen_bc2$QC_score_bc1_model, + use="complete.obs", + method="spearman" +) + +cor_bc2_pearson <- cor( + spexen_bc2$QC_score_bc2_native, + spexen_bc2$QC_score_bc1_model, + use="complete.obs", + method="pearson" +) + +res_bc1 <- plot_qc_score_comparison( + x=spexen_bc1$QC_score_bc1_native, + y=spexen_bc1$QC_score_bc2_model, + x_label="Breast Cancer 1 native QS", + y_label="Breast Cancer 2-trained QS", + title="Xenium Breast Cancer 1: native QS vs Breast Cancer 2-trained QS", + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + "xenium_breast1_native_vs_breast2_model_A4_landscape.pdf" + ) +) + +res_bc2 <- plot_qc_score_comparison( + x=spexen_bc2$QC_score_bc2_native, + y=spexen_bc2$QC_score_bc1_model, + x_label="Breast Cancer 2 native QS", + y_label="Breast Cancer 1-trained QS", + title="Xenium Breast Cancer 2: native QS vs Breast Cancer 1-trained QS", + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + "xenium_breast2_native_vs_breast1_model_A4_landscape.pdf" + ) +) + +res_bc1$plot +res_bc2$plot + +color_vars <- c( + "log2SignalDensity", + "Area_um", + "log2AspectRatio", + "log2Ctrl_total_ratio" +) + +bc1_colored_plots <- lapply(color_vars, function(v) { + plot_qc_score_coldata( + x=spexen_bc1$QC_score_bc1_native, + y=spexen_bc1$QC_score_bc2_model, + spe=spexen_bc1, + color_col=v, + x_label="Breast Cancer 1 native QS", + y_label="Breast Cancer 2-trained QS", + title=paste0("Breast Cancer 1 native vs Breast Cancer 2-trained QS: ", v), + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + paste0("xenium_breast1_native_vs_breast2_model_", v, ".pdf") + ) + ) +}) +names(bc1_colored_plots) <- color_vars + +bc2_colored_plots <- lapply(color_vars, function(v) { + plot_qc_score_coldata( + x=spexen_bc2$QC_score_bc2_native, + y=spexen_bc2$QC_score_bc1_model, + spe=spexen_bc2, + color_col=v, + x_label="Breast Cancer 2 native QS", + y_label="Breast Cancer 1-trained QS", + title=paste0("Breast Cancer 2 native vs Breast Cancer 1-trained QS: ", v), + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + paste0("xenium_breast2_native_vs_breast1_model_", v, ".pdf") + ) + ) +}) +names(bc2_colored_plots) <- color_vars + +xenium_breast_transfer_summary <- data.frame( + comparison=c( + "Breast Cancer 1 native vs Breast Cancer 2-trained", + "Breast Cancer 2 native vs Breast Cancer 1-trained" + ), + spearman=c(cor_bc1_spearman, cor_bc2_spearman), + pearson=c(cor_bc1_pearson, cor_bc2_pearson), + stringsAsFactors=FALSE +) + +xenium_breast_transfer_summary + +write.csv( + xenium_breast_transfer_summary, + file=file.path(out_dir, "xenium_breast1_breast2_transfer_summary.csv"), + row.names=FALSE +) + +metadata(spexen_bc1)$QCScore_model$model_matrix_colnames +metadata(spexen_bc2)$QCScore_model$model_matrix_colnames + +######################## + +dbkx_path <- "~/Downloads/Xenium_data/db_kero_xen" + +spexen <- readXeniumSPE(dbkx_path) +spexen_bc1 <- readXeniumSPE("/Users/inzirio/Downloads/Xenium_data/Xenium_HumanBreast1_Janesick/") + + + +spexen <- spatialPerCellQC(spexen) +spexen_bc1 <- spatialPerCellQC(spexen_bc1) + +xenium_formula <- paste0( + "~(", + "log2SignalDensity + ", + "Area_um + ", + "log2AspectRatio + ", + "log2Ctrl_total_ratio", + ")^2" +) + +spexen <- computeQCScore( + spe=spexen, + modelFormula=xenium_formula, + verbose=TRUE +) + +spexen_bc1 <- computeQCScore( + spe=spexen_bc1, + modelFormula=xenium_formula, + verbose=TRUE +) + +spexen$QC_score_dbk_native <- spexen$QC_score +spexen_bc1$QC_score_bc1_native <- spexen_bc1$QC_score + +dbk_model <- metadata(spexen)$QCScore_model +bc1_model <- metadata(spexen_bc1)$QCScore_model + +spexen <- applyQCScoreModel( + spe=spexen, + qcModel=bc1_model, + scoreName="QC_score_bc1_model" +) + +spexen_bc1 <- applyQCScoreModel( + spe=spexen_bc1, + qcModel=dbk_model, + scoreName="QC_score_dbk_model" +) + +cor_dbk_spearman <- cor( + spexen$QC_score_dbk_native, + spexen$QC_score_bc1_model, + use="complete.obs", + method="spearman" +) + +cor_dbk_pearson <- cor( + spexen$QC_score_dbk_native, + spexen$QC_score_bc1_model, + use="complete.obs", + method="pearson" +) + +cor_bc1_spearman <- cor( + spexen_bc1$QC_score_bc1_native, + spexen_bc1$QC_score_dbk_model, + use="complete.obs", + method="spearman" +) + +cor_bc1_pearson <- cor( + spexen_bc1$QC_score_bc1_native, + spexen_bc1$QC_score_dbk_model, + use="complete.obs", + method="pearson" +) + +res_dbk <- plot_qc_score_comparison( + x=spexen$QC_score_dbk_native, + y=spexen$QC_score_bc1_model, + x_label="DBK Xenium native QS", + y_label="Breast 1-trained QS", + title="Xenium DBK: native QS vs Breast 1-trained QS", + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + "xenium_dbk_native_vs_breast1_model_A4_landscape.pdf" + ) +) + +res_bc1 <- plot_qc_score_comparison( + x=spexen_bc1$QC_score_bc1_native, + y=spexen_bc1$QC_score_dbk_model, + x_label="Breast 1 native QS", + y_label="DBK-trained QS", + title="Xenium Breast 1: native QS vs DBK-trained QS", + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + "xenium_breast1_native_vs_dbk_model_A4_landscape.pdf" + ) +) + +res_dbk$plot +res_bc1$plot + +color_vars <- c( + "log2SignalDensity", + "Area_um", + "log2AspectRatio", + "log2Ctrl_total_ratio" +) + +dbk_colored_plots <- lapply(color_vars, function(v) { + plot_qc_score_coldata( + x=spexen$QC_score_dbk_native, + y=spexen$QC_score_bc1_model, + spe=spexen, + color_col=v, + x_label="DBK Xenium native QS", + y_label="Breast 1-trained QS", + title=paste0("DBK Xenium native vs Breast 1-trained QS: ", v), + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + paste0("xenium_dbk_native_vs_breast1_model_", v, ".pdf") + ) + ) +}) +names(dbk_colored_plots) <- color_vars + +bc1_colored_plots <- lapply(color_vars, function(v) { + plot_qc_score_coldata( + x=spexen_bc1$QC_score_bc1_native, + y=spexen_bc1$QC_score_dbk_model, + spe=spexen_bc1, + color_col=v, + x_label="Breast 1 native QS", + y_label="DBK-trained QS", + title=paste0("Breast 1 native vs DBK-trained QS: ", v), + threshold=0.75, + cor_method="spearman", + output_file=file.path( + out_dir, + paste0("xenium_breast1_native_vs_dbk_model_", v, ".pdf") + ) + ) +}) +names(bc1_colored_plots) <- color_vars + +xenium_dbk_breast1_transfer_summary <- data.frame( + comparison=c( + "DBK Xenium native vs Breast 1-trained", + "Breast 1 native vs DBK-trained" + ), + spearman=c(cor_dbk_spearman, cor_bc1_spearman), + pearson=c(cor_dbk_pearson, cor_bc1_pearson), + stringsAsFactors=FALSE +) + +xenium_dbk_breast1_transfer_summary + +write.csv( + xenium_dbk_breast1_transfer_summary, + file=file.path(out_dir, "xenium_dbk_breast1_transfer_summary.csv"), + row.names=FALSE +) + +metadata(spexen)$QCScore_model$model_matrix_colnames +metadata(spexen_bc1)$QCScore_model$model_matrix_colnames + +######################### From faa1af5f52a0c2a49be3800372ec1c9b76d3550c Mon Sep 17 00:00:00 2001 From: Dario Date: Mon, 25 May 2026 11:25:32 +0200 Subject: [PATCH 06/10] adding citation file --- inst/CITATION | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 inst/CITATION diff --git a/inst/CITATION b/inst/CITATION new file mode 100644 index 0000000..e0a7e38 --- /dev/null +++ b/inst/CITATION @@ -0,0 +1,23 @@ +bibentry( + bibtype="Article", + title="SpaceTrooper: a quality control framework for imaging-based spatial omics data", + author=c( + person("Benedetta", "Banzi"), + person("Dario", "Righelli"), + person("Matteo", "Marchionni"), + person("Oriana", "Romano"), + person("Mattia", "Forcato"), + person("Davide", "Risso"), + person("Silvio", "Bicciato") + ), + journal="bioRxiv", + year="2025", + doi="10.64898/2025.12.24.696336", + url="https://doi.org/10.64898/2025.12.24.696336", + textVersion=paste( + "Banzi B, Righelli D, Marchionni M, Romano O, Forcato M,", + "Risso D, Bicciato S. SpaceTrooper: a quality control framework", + "for imaging-based spatial omics data. bioRxiv, 2025.", + "doi:10.64898/2025.12.24.696336" + ) +) From c732c22608ef5f364488da5602db50f2dd277fe7 Mon Sep 17 00:00:00 2001 From: Dario Date: Mon, 25 May 2026 11:35:10 +0200 Subject: [PATCH 07/10] news version 1.1.8 --- NEWS.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/NEWS.md b/NEWS.md index b2c76b3..27c1dce 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# Changes in version 1.1.8 + +* fixing naming of of spacetrooper utilities vignette +* adding functions for QC model transfer across datasets +* several new scripts for QC tranferring and comparison +* adding citation file with biorxiv paper + # Changes in version 1.1.7 * fixing author name typo From 031dc0cdd5d1a5c0f42f95ac50703161188c864f Mon Sep 17 00:00:00 2001 From: Dario Date: Mon, 25 May 2026 15:04:37 +0200 Subject: [PATCH 08/10] fixing errors on check --- NAMESPACE | 3 +++ R/QC.R | 11 ++++++++- man/applyQCScoreModel.Rd | 48 ++++++++++++++++++++++++++++++++++++++++ man/computeQCScore.Rd | 11 ++++++++- 4 files changed, 71 insertions(+), 2 deletions(-) create mode 100644 man/applyQCScoreModel.Rd diff --git a/NAMESPACE b/NAMESPACE index 4e9db72..cc92c05 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,7 @@ export(.getActiveGeometryName) export(.renameGeometry) export(.setActiveGeometry) export(addPolygonsToSPE) +export(applyQCScoreModel) export(checkOutliers) export(computeAreaFromPolygons) export(computeAspectRatioFromPolygons) @@ -126,6 +127,8 @@ importFrom(sf,st_sf) importFrom(sf,st_sfc) importFrom(sfheaders,sf_polygon) importFrom(stats,as.formula) +importFrom(stats,coef) +importFrom(stats,complete.cases) importFrom(stats,model.matrix) importFrom(stats,predict) importFrom(stats,quantile) diff --git a/R/QC.R b/R/QC.R index 6d69efd..78f5798 100644 --- a/R/QC.R +++ b/R/QC.R @@ -441,12 +441,20 @@ computeLambda <- function(trainDF, modelFormula) { #' @param spe A `SpatialExperiment` object with spatial transcriptomics data. #' @param verbose logical for having a verbose output. Default is FALSE. #' @param bestLambda the best lambda typically computed using `computeLambda`. +#' @param modelFormula a character string representing the model formula to be +#' used for training the model. If NULL, the formula is automatically generated +#' based on the available metrics and outliers in the dataset. +#' See Details for more information. +#' Note that the automatically generated formula will include interaction +#' terms between the metrics, and will exclude metrics with insufficient +#' outliers (< 0.1% of the dataset). If a custom `modelFormula` is provided, +#' it will be used as is without modification or checks for outlier counts. #' #' @return The `SpatialExperiment` object with added QC score in `colData`. #' @export #' @importFrom dplyr case_when filter mutate distinct pull #' @importFrom glmnet glmnet cv.glmnet -#' @importFrom stats as.formula model.matrix quantile predict +#' @importFrom stats as.formula model.matrix quantile predict coef #' @examples #' example(spatialPerCellQC) #' set.seed(1998) @@ -1323,6 +1331,7 @@ applyQCScoreModel <- function(spe, qcModel, scoreName="QC_score") { return(modelMatrix) } +#' @importFrom stats complete.cases .filterCompleteModelCases <- function(df, modelFormula, response=NULL, context="cells") { diff --git a/man/applyQCScoreModel.Rd b/man/applyQCScoreModel.Rd new file mode 100644 index 0000000..9bea5d2 --- /dev/null +++ b/man/applyQCScoreModel.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/QC.R +\name{applyQCScoreModel} +\alias{applyQCScoreModel} +\title{applyQCScoreModel} +\usage{ +applyQCScoreModel(spe, qcModel, scoreName = "QC_score") +} +\arguments{ +\item{spe}{A `SpatialExperiment` object with QC metrics already computed.} + +\item{qcModel}{A QC score model object, usually stored in +`metadata(spe)$QCScore_model`.} + +\item{scoreName}{Name of the output column in `colData`.} +} +\value{ +A `SpatialExperiment` object with added QC score in `colData`. +} +\description{ +Apply a previously trained QC score model to a new SpatialExperiment object. +} +\examples{ + +example(readCosmxSPE) + +## Compute per-cell quality control metrics +spe <- spatialPerCellQC(spe) + +## Split the object only for example purposes +set.seed(1998) +idx <- sample(seq_len(ncol(spe)), floor(ncol(spe) / 2)) +spe_train <- spe[, idx] +spe_test <- spe[, -idx] + +## Train the Quality Control (QC) score model on one dataset +spe_train <- computeQCScore(spe_train) +qc_model <- metadata(spe_train)$QCScore_model + +## Apply the trained model to another dataset +spe_test <- applyQCScoreModel( + spe=spe_test, + qcModel=qc_model, + scoreName="QC_score_transferred" +) + +summary(spe_test$QC_score_transferred) +} diff --git a/man/computeQCScore.Rd b/man/computeQCScore.Rd index a9fce89..0af8503 100644 --- a/man/computeQCScore.Rd +++ b/man/computeQCScore.Rd @@ -4,13 +4,22 @@ \alias{computeQCScore} \title{computeQCScore} \usage{ -computeQCScore(spe, bestLambda = NULL, verbose = FALSE) +computeQCScore(spe, bestLambda = NULL, modelFormula = NULL, verbose = FALSE) } \arguments{ \item{spe}{A `SpatialExperiment` object with spatial transcriptomics data.} \item{bestLambda}{the best lambda typically computed using `computeLambda`.} +\item{modelFormula}{a character string representing the model formula to be +used for training the model. If NULL, the formula is automatically generated +based on the available metrics and outliers in the dataset. +See Details for more information. +Note that the automatically generated formula will include interaction +terms between the metrics, and will exclude metrics with insufficient +outliers (< 0.1% of the dataset). If a custom `modelFormula` is provided, +it will be used as is without modification or checks for outlier counts.} + \item{verbose}{logical for having a verbose output. Default is FALSE.} } \value{ From f4564e40b7894be21d90fe36f37c0c89b57fb4af Mon Sep 17 00:00:00 2001 From: Dario Date: Wed, 10 Jun 2026 16:35:39 +0200 Subject: [PATCH 09/10] removing unuseful scripts --- NEWS.md | 1 - inst/scripts/ablation_study.R | 484 ------------ inst/scripts/compare_qs_general.R | 79 -- inst/scripts/create_cosmx_rds.R | 39 - inst/scripts/evaluate_qs_split_kfold.R | 726 ------------------ inst/scripts/run_qs_general.R | 275 ------- inst/scripts/transfer_coeffs_cosmx_cosmx.R | 730 ------------------- inst/scripts/transfer_coeffs_cosmx_xenium.R | 200 ----- inst/scripts/transfer_coeffs_xenium_xenium.R | 348 --------- inst/scripts/transfer_qs_coefficients.R | 665 ----------------- 10 files changed, 3547 deletions(-) delete mode 100644 inst/scripts/ablation_study.R delete mode 100644 inst/scripts/compare_qs_general.R delete mode 100644 inst/scripts/create_cosmx_rds.R delete mode 100644 inst/scripts/evaluate_qs_split_kfold.R delete mode 100644 inst/scripts/run_qs_general.R delete mode 100644 inst/scripts/transfer_coeffs_cosmx_cosmx.R delete mode 100644 inst/scripts/transfer_coeffs_cosmx_xenium.R delete mode 100644 inst/scripts/transfer_coeffs_xenium_xenium.R delete mode 100644 inst/scripts/transfer_qs_coefficients.R diff --git a/NEWS.md b/NEWS.md index 27c1dce..0887798 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,7 +2,6 @@ * fixing naming of of spacetrooper utilities vignette * adding functions for QC model transfer across datasets -* several new scripts for QC tranferring and comparison * adding citation file with biorxiv paper # Changes in version 1.1.7 diff --git a/inst/scripts/ablation_study.R b/inst/scripts/ablation_study.R deleted file mode 100644 index 9cd3777..0000000 --- a/inst/scripts/ablation_study.R +++ /dev/null @@ -1,484 +0,0 @@ -## ============================================================ -## 0. Libraries and output directory -## ============================================================ - -library(SpaceTrooper) -library(ggplot2) - -out_dir <- "~/SpaceTrooper_qs_check" - -if (!dir.exists(out_dir)) { - dir.create(out_dir, recursive=TRUE) -} - - -## ============================================================ -## 1. Define ablation formulas -## ============================================================ -## The full model contains all Quality Score (QS) components. -## Each ablated model removes one component at a time. - -formulas <- list( - full="~(log2SignalDensity + Area_um + log2AspectRatio + log2Ctrl_total_ratio)^2", - - no_log2SignalDensity= - "~(Area_um + log2AspectRatio + log2Ctrl_total_ratio)^2", - - no_Area_um= - "~(log2SignalDensity + log2AspectRatio + log2Ctrl_total_ratio)^2", - - no_log2AspectRatio= - "~(log2SignalDensity + Area_um + log2Ctrl_total_ratio)^2", - - no_log2Ctrl_total_ratio= - "~(log2SignalDensity + Area_um + log2AspectRatio)^2" -) - - -## ============================================================ -## 2. Function to run QS ablation models -## ============================================================ -## For each formula, computeQCScore() is run with a forced model formula. -## The output stores: -## - the computed QS vector -## - the trained QS model stored in metadata(spe)$QCScore_model - -run_qs_ablation <- function(spe, formulas, verbose=FALSE) { - out <- list() - - for (nm in names(formulas)) { - message("Running model: ", nm) - - spe_i <- computeQCScore( - spe=spe, - modelFormula=formulas[[nm]], - verbose=verbose - ) - - out[[nm]] <- list( - spe=spe_i, - score=spe_i$QC_score, - model=metadata(spe_i)$QCScore_model - ) - } - - return(out) -} - - -## ============================================================ -## 3. Read and preprocess CosMx data -## ============================================================ -## The object is read, then per-cell QC metrics are computed. -## These metrics are required by computeQCScore(). - -cosm_path <- "~/Downloads/CosMx_data/DBKero/CosMx_Breast/CosMx_data_Case2/" - -specosm <- readCosmxSPE(cosm_path) - -specosm <- spatialPerCellQC(specosm) - - -## ============================================================ -## 4. Run ablation study on CosMx -## ============================================================ -## This computes the full QS and one QS for each ablated formula. - -set.seed(1998) - -abl_cosmx <- run_qs_ablation( - spe=specosm, - formulas=formulas, - verbose=TRUE -) - - -## ============================================================ -## 5. Functions to summarize ablation results -## ============================================================ -## bottom_overlap() compares the low-QS tails of two score vectors. -## By default q=0.10 means the bottom 10% of cells. -## -## summarize_qs_ablation() compares every ablated QS against the full QS -## using: -## - Pearson correlation -## - Spearman correlation -## - absolute score differences -## - overlap of bottom-tail low-quality cells - -bottom_overlap <- function(x, y, q=0.10) { - x_bad <- x <= quantile(x, probs=q, na.rm=TRUE) - y_bad <- y <= quantile(y, probs=q, na.rm=TRUE) - - overlap <- sum(x_bad & y_bad, na.rm=TRUE) - union <- sum(x_bad | y_bad, na.rm=TRUE) - - data.frame( - quantile=q, - overlap=overlap, - union=union, - jaccard=overlap / union, - prop_ref_recovered=overlap / sum(x_bad, na.rm=TRUE) - ) -} - - -summarize_qs_ablation <- function(ablation_results, reference="full", q=0.10) { - ref_score <- ablation_results[[reference]]$score - - res <- lapply(names(ablation_results), function(nm) { - score <- ablation_results[[nm]]$score - - ov <- bottom_overlap(ref_score, score, q=q) - - data.frame( - model=nm, - pearson=cor( - ref_score, - score, - use="complete.obs", - method="pearson" - ), - spearman=cor( - ref_score, - score, - use="complete.obs", - method="spearman" - ), - mean_abs_diff=mean(abs(ref_score - score), na.rm=TRUE), - median_abs_diff=median(abs(ref_score - score), na.rm=TRUE), - max_abs_diff=max(abs(ref_score - score), na.rm=TRUE), - bottom_jaccard=ov$jaccard, - bottom_ref_recovered=ov$prop_ref_recovered, - n_na=sum(is.na(score)), - stringsAsFactors=FALSE - ) - }) - - do.call(rbind, res) -} - - -## ============================================================ -## 6. Build ablation summary table -## ============================================================ - -tab_cosmx <- summarize_qs_ablation( - ablation_results=abl_cosmx, - reference="full", - q=0.10 -) - -tab_cosmx - - -## ============================================================ -## 7. Pairwise plots: full QS vs each ablated QS -## ============================================================ -## This assumes plot_qc_score_comparison() is already available. -## Each plot is also saved as an A4 landscape PDF. - -ablation_names <- setdiff(names(abl_cosmx), "full") - -plots_cosmx <- lapply(ablation_names, function(nm) { - plot_qc_score_comparison( - x=abl_cosmx$full$score, - y=abl_cosmx[[nm]]$score, - x_label="Full QS", - y_label=paste0("QS: ", nm), - title=paste0("CosMx: full QS vs ", nm), - threshold=0.75, - cor_method="spearman", - output_file=file.path( - out_dir, - paste0("cosmx_full_vs_", nm, "_A4_landscape.pdf") - ) - ) -}) - -names(plots_cosmx) <- ablation_names - -## Print plots -lapply(plots_cosmx, function(x) print(x$plot)) - - -## ============================================================ -## 8. Heatmap: complete ablation summary -## ============================================================ -## This heatmap includes correlations, score differences, and bottom-tail -## overlap metrics. - -heat_df <- tab_cosmx[tab_cosmx$model != "full", ] - -heat_df <- heat_df[, c( - "model", - "pearson", - "spearman", - "mean_abs_diff", - "median_abs_diff", - "bottom_jaccard", - "bottom_ref_recovered" -)] - -heat_long <- reshape( - heat_df, - varying=names(heat_df)[names(heat_df) != "model"], - v.names="value", - timevar="metric", - times=names(heat_df)[names(heat_df) != "model"], - direction="long" -) - -rownames(heat_long) <- NULL - -p_heat <- ggplot( - heat_long, - aes(x=metric, y=model, fill=value) -) + - geom_tile() + - geom_text( - aes(label=formatC(value, digits=3, format="f")), - color="white", - size=3 - ) + - labs( - title="CosMx QS ablation summary", - x="Metric", - y="Ablation model", - fill="Value" - ) + - theme_minimal() + - theme( - axis.text.x=element_text(angle=45, hjust=1) - ) - -p_heat - -ggsave( - filename=file.path( - out_dir, - "cosmx_ablation_summary_heatmap_A4_landscape.pdf" - ), - plot=p_heat, - device="pdf", - width=11.69, - height=8.27, - units="in" -) - - -## ============================================================ -## 9. Heatmap: correlation-only summary -## ============================================================ -## Cleaner figure showing only Pearson and Spearman correlations -## between the full QS and each ablated QS. - -cor_heat_df <- tab_cosmx[ - tab_cosmx$model != "full", - c("model", "pearson", "spearman") -] - -cor_heat_long <- reshape( - cor_heat_df, - varying=c("pearson", "spearman"), - v.names="correlation", - timevar="metric", - times=c("pearson", "spearman"), - direction="long" -) - -rownames(cor_heat_long) <- NULL - -p_cor_heat <- ggplot( - cor_heat_long, - aes(x=metric, y=model, fill=correlation) -) + - geom_tile() + - geom_text( - aes(label=formatC(correlation, digits=3, format="f")), - color="white", - size=4 - ) + - labs( - title="CosMx QS ablation correlation with full model", - x="Correlation metric", - y="Ablation model", - fill="Correlation" - ) + - theme_minimal() - -p_cor_heat - -ggsave( - filename=file.path( - out_dir, - "cosmx_ablation_correlation_heatmap.pdf" - ), - plot=p_cor_heat, - device="pdf", - width=8, - height=5, - units="in" -) - - -## ============================================================ -## 10. Optional: save tabular results -## ============================================================ - -write.csv( - tab_cosmx, - file=file.path(out_dir, "cosmx_ablation_summary_table.csv"), - row.names=FALSE -) - - -plot_ablation_colored <- function(ablation_results, ablation_name, - reference="full", color_col=NULL, - x_label="Full QS", y_label=NULL, title=NULL, - cor_method="spearman", output_file=NULL) { - - stopifnot(reference %in% names(ablation_results)) - stopifnot(ablation_name %in% names(ablation_results)) - - ref_score <- ablation_results[[reference]]$score - abl_score <- ablation_results[[ablation_name]]$score - - stopifnot(length(ref_score) == length(abl_score)) - - spe_ref <- ablation_results[[reference]]$spe - cd <- as.data.frame(colData(spe_ref)) - - df <- data.frame( - full_qs=ref_score, - ablated_qs=abl_score - ) - - if (!is.null(color_col)) { - stopifnot(color_col %in% colnames(cd)) - df$color_var <- cd[[color_col]] - } - - cor_val <- cor( - df$full_qs, - df$ablated_qs, - use="complete.obs", - method=cor_method - ) - - if (is.null(y_label)) { - y_label <- paste0("QS: ", ablation_name) - } - - if (is.null(title)) { - title <- paste0("Full QS vs ", ablation_name) - } - - if (is.null(color_col)) { - p <- ggplot(df, aes(x=full_qs, y=ablated_qs)) + - geom_point(alpha=0.35, size=0.6) - } else { - p <- ggplot(df, aes(x=full_qs, y=ablated_qs, color=color_var)) + - geom_point(alpha=0.35, size=0.6) - } - - p <- p + - geom_abline( - slope=1, - intercept=0, - linetype="dashed", - color="red" - ) + - labs( - x=x_label, - y=y_label, - title=title, - color=color_col - ) + - theme_minimal() + - annotate( - "text", - x=quantile(df$full_qs, 0.99, na.rm=TRUE), - y=quantile(df$ablated_qs, 0.01, na.rm=TRUE), - label=paste0( - cor_method, - " r = ", - formatC(cor_val, digits=3, format="f") - ), - hjust=1, - vjust=0, - color="black" - ) - - if (!is.null(output_file)) { - ggsave( - filename=output_file, - plot=p, - device="pdf", - width=11.69, - height=8.27, - units="in" - ) - } - - return(list( - plot=p, - correlation=cor_val, - data=df - )) -} - -p_no_signal_col <- plot_ablation_colored( - ablation_results=abl_cosmx, - ablation_name="no_log2SignalDensity", - color_col="log2Ctrl_total_ratio", - title="CosMx: full QS vs QS without log2SignalDensity", - y_label="QS without log2SignalDensity", - output_file="~/SpaceTrooper_qs_check/cosmx_full_vs_no_log2SignalDensity_log2Ctrl_total_ratiocolored.pdf" -) -p_no_signal_col_area <- plot_ablation_colored( - ablation_results=abl_cosmx, - ablation_name="no_log2SignalDensity", - color_col="Area_um", - title="CosMx: full QS vs QS without log2SignalDensity", - y_label="QS without log2SignalDensity", - output_file="~/SpaceTrooper_qs_check/cosmx_full_vs_no_log2SignalDensity_Area_um_colored.pdf" -) - -p_no_signal_col_aspect <- plot_ablation_colored( - ablation_results=abl_cosmx, - ablation_name="no_log2SignalDensity", - color_col="log2AspectRatio", - title="CosMx: full QS vs QS without log2SignalDensity", - y_label="QS without log2SignalDensity", - output_file="~/SpaceTrooper_qs_check/cosmx_full_vs_no_log2SignalDensity_log2AspectRatio_colored.pdf" -) - -p_no_signal_col_aspect$plot -p_no_signal_col$plot - -plot_ablation_colored( - ablation_results=abl_cosmx, - ablation_name="no_log2Ctrl_total_ratio", - color_col="log2SignalDensity", - title="CosMx: full QS vs QS without log2Ctrl_total_ratio", - y_label="QS without log2Ctrl_total_ratio", - output_file="~/SpaceTrooper_qs_check/cosmx_full_vs_no_log2ctrl_total_ratio_log2SignalDensitycolored.pdf" -)$plot - -plot_ablation_colored( - ablation_results=abl_cosmx, - ablation_name="no_log2Ctrl_total_ratio", - color_col="Area_um", - title="CosMx: full QS vs QS without log2Ctrl_total_ratio", - y_label="QS without log2Ctrl_total_ratio", - output_file="~/SpaceTrooper_qs_check/cosmx_full_vs_no_log2Ctrl_total_ratio_Area_um_colored.pdf" -)$plot -plot_ablation_colored( - ablation_results=abl_cosmx, - ablation_name="no_log2Ctrl_total_ratio", - color_col="log2AspectRatio", - title="CosMx: full QS vs QS without log2Ctrl_total_ratio", - y_label="QS without log2Ctrl_total_ratio", - output_file="~/SpaceTrooper_qs_check/cosmx_full_vs_no_log2Ctrl_total_ratio_log2AspectRatio_colored.pdf" -)$plot diff --git a/inst/scripts/compare_qs_general.R b/inst/scripts/compare_qs_general.R deleted file mode 100644 index 60dd790..0000000 --- a/inst/scripts/compare_qs_general.R +++ /dev/null @@ -1,79 +0,0 @@ -args <- commandArgs(trailingOnly=TRUE) - -if (length(args) != 2) { - stop("Usage: Rscript compare_qs_general.R before.rds current.rds") -} - -before <- readRDS(args[[1]]) -current <- readRDS(args[[2]]) - -qs_before <- before$quality_score -qs_current <- current$quality_score - -cat("BEFORE\n") -cat("Branch:", before$git_branch, "\n") -cat("Commit:", before$git_hash, "\n") -cat("QScore column:", before$qscore_column, "\n") -cat("Detection:", before$qscore_detection_method, "\n\n") - -cat("CURRENT\n") -cat("Branch:", current$git_branch, "\n") -cat("Commit:", current$git_hash, "\n") -cat("QScore column:", current$qscore_column, "\n") -cat("Detection:", current$qscore_detection_method, "\n\n") - -cat("Lengths:\n") -cat("before:", length(qs_before), "\n") -cat("current:", length(qs_current), "\n\n") - -if (length(qs_before) != length(qs_current)) { - stop("Different number of Quality Score values. Cannot compare directly.") -} - -comparison <- data.frame( - index=seq_along(qs_before), - quality_score_before=qs_before, - quality_score_current=qs_current, - difference=qs_current - qs_before -) - -cat("Summary before:\n") -print(summary(qs_before)) - -cat("\nSummary current:\n") -print(summary(qs_current)) - -cat("\nSummary difference current - before:\n") -print(summary(comparison$difference)) - -cat("\nall.equal:\n") -print(all.equal(qs_before, qs_current)) - -cat("\nExactly different values:\n") -print(sum(qs_before != qs_current, na.rm=TRUE)) - -cat("\nDifferent above tolerance 1e-8:\n") -print(sum(abs(qs_current - qs_before) > 1e-8, na.rm=TRUE)) - -cat("\nDifferent above tolerance 1e-6:\n") -print(sum(abs(qs_current - qs_before) > 1e-6, na.rm=TRUE)) - -cat("\nDifferent above tolerance 1e-4:\n") -print(sum(abs(qs_current - qs_before) > 1e-4, na.rm=TRUE)) - -out_csv <- file.path( - dirname(args[[2]]), - "quality_score_comparison.csv" -) - -out_rds <- file.path( - dirname(args[[2]]), - "quality_score_comparison.rds" -) - -write.csv(comparison, out_csv, row.names=FALSE) -saveRDS(comparison, out_rds) - -cat("\nSaved:\n") -cat(out_csv, "\n") -cat(out_rds, "\n") diff --git a/inst/scripts/create_cosmx_rds.R b/inst/scripts/create_cosmx_rds.R deleted file mode 100644 index 152d9f8..0000000 --- a/inst/scripts/create_cosmx_rds.R +++ /dev/null @@ -1,39 +0,0 @@ -suppressPackageStartupMessages({ - library(devtools) - library(SpatialExperiment) - library(SummarizedExperiment) -}) - - -input_dir <- "/Users/inzirio/Downloads/CosMx_data/DBKero/CosMx_Breast/CosMx_data_Case2" - -output_rds <- "~/SpaceTrooper_qs_check/cosmx_case2_spe_raw.rds" -output_rds <- path.expand(output_rds) - - -library(SpaceTrooper) - -if (!dir.exists(input_dir)) { - stop("Input directory does not exist: ", input_dir) -} - -message("Reading CosMx dataset from:") -message(input_dir) - -spe <- readCosmxSPE(input_dir) - -message("Object class:") -print(class(spe)) - -message("Object dimensions:") -print(dim(spe)) - -message("colData columns:") -print(colnames(SummarizedExperiment::colData(spe))) - -message("Saving RDS to:") -message(output_rds) - -saveRDS(spe, output_rds) - -message("Done.") diff --git a/inst/scripts/evaluate_qs_split_kfold.R b/inst/scripts/evaluate_qs_split_kfold.R deleted file mode 100644 index faa4d1d..0000000 --- a/inst/scripts/evaluate_qs_split_kfold.R +++ /dev/null @@ -1,726 +0,0 @@ -library(SpaceTrooper) -library(SummarizedExperiment) -library(S4Vectors) - -# Apply a fitted ridge-logistic QS model to a new SpatialExperiment subset. -# -# The function rebuilds the design matrix from `colData`, using the same -# formula employed during training, and stores predicted probabilities in the -# `QC_score` column. -score_subset <- function(spe_subset, fit, lambda, model_formula) { - # Convert per-cell metadata to a plain data.frame so model.matrix can use it. - df_subset <- as.data.frame(SummarizedExperiment::colData(spe_subset)) - - # Recreate the exact predictor matrix used by glmnet. - x_subset <- stats::model.matrix(stats::as.formula(model_formula), - data=df_subset) - - # Predict the probability of being a good-quality cell. - spe_subset$QC_score <- as.vector( - stats::predict(fit, s=lambda, newx=x_subset, type="response") - ) - spe_subset -} - -# Compute AUROC from ranks, without requiring extra packages. -# -# `label_positive` must be a 0/1 vector, where 1 indicates the positive class. -# In this script we use it both for bad-vs-good and for derived summaries. -auc_rank <- function(score, label_positive) { - # Coerce labels to integer to avoid issues with logical/factor inputs. - label_positive <- as.integer(label_positive) - n_pos <- sum(label_positive == 1L) - n_neg <- sum(label_positive == 0L) - - # AUROC is undefined if one of the two classes is missing. - if (n_pos == 0L || n_neg == 0L) { - return(NA_real_) - } - - # Mann-Whitney / rank-based computation of AUROC. - ranks <- rank(score, ties.method="average") - (sum(ranks[label_positive == 1L]) - n_pos * (n_pos + 1) / 2) / - (n_pos * n_neg) -} - -# Build pseudo-reference labels on a subset, following the same SpaceTrooper -# logic used internally to define training examples. -# -# Returned labels are: -# - `qcscore_train = 0`: low-quality examples -# - `qcscore_train = 1`: good-quality examples -build_reference_labels <- function(spe_subset, verbose=FALSE) { - # Detect outliers for the metrics used by QS. - spe_ref <- computeOutliersQCScore(spe_subset) - - # Remove variables that do not have enough outliers to be informative. - spe_ref <- checkOutliers(spe_ref, verbose=verbose) - - # Build a balanced table of bad and good examples. - df_ref <- computeTrainDF( - colData=SummarizedExperiment::colData(spe_ref), - formulaVars=S4Vectors::metadata(spe_ref)$formula_variables, - tech=S4Vectors::metadata(spe_ref)$technology, - verbose=verbose - ) - - # Keep only the columns needed for downstream evaluation. - df_ref[, c("cell_id", "qcscore_train")] -} - -# Safe wrapper around `build_reference_labels()`. -# -# On small or unlucky splits/folds, SpaceTrooper may not find enough outliers -# to define a reference set. Instead of stopping the whole script, this returns -# the error message so the caller can inspect it. -safe_reference_labels <- function(spe_subset, verbose=FALSE) { - tryCatch( - list(data=build_reference_labels(spe_subset, verbose=verbose), - error=NULL), - error=function(e) list(data=NULL, error=conditionMessage(e)) - ) -} - -# Safely extract an optional scalar character field from a list. -# -# This is used when collecting fold-level error messages at the end of k-fold -# evaluation. Missing, NULL, or NA entries are converted to `NA_character_`. -extract_optional_chr1 <- function(x, name) { - value <- x[[name]] - - if (is.null(value) || length(value) == 0L || all(is.na(value))) { - return(NA_character_) - } - - as.character(value[[1]]) -} - -# Add good/bad/unlabelled labels to a SpatialExperiment subset for plotting. -# -# Labels are derived from the evaluation table: -# - `qcscore_train = 0` -> bad -# - `qcscore_train = 1` -> good -# - missing label -> unlabelled -append_eval_labels <- function( - spe_subset, - df_eval, - label_col="eval_label", - colour_col="eval_label_color" -) { - labels <- rep("unlabelled", ncol(spe_subset)) - - if (!is.null(df_eval) && nrow(df_eval) > 0L) { - idx <- match(spe_subset$cell_id, df_eval$cell_id) - keep <- !is.na(idx) - labels[keep] <- ifelse(df_eval$qcscore_train[idx[keep]] == 0, - "bad", "good") - } - - spe_subset[[label_col]] <- factor(labels, - levels=c("bad", "good", "unlabelled")) - - palette <- c( - bad="#D55E00", - good="#009E73", - unlabelled="#BDBDBD" - ) - spe_subset[[colour_col]] <- unname(palette[as.character(spe_subset[[label_col]])]) - spe_subset -} - -# Select either the whole result object (single split) or one fold result. -# -# This helper keeps the plotting functions compact and makes the API uniform -# across single-split and k-fold outputs. -get_result_level <- function(result, fold=NULL) { - if (identical(result$split_type, "k_fold_cv")) { - if (is.null(fold)) { - stop("Please provide `fold` when plotting a k-fold result") - } - if (length(fold) != 1L || is.na(fold) || fold < 1 || fold > length(result$folds)) { - stop("`fold` must be an integer between 1 and the number of folds") - } - return(result$folds[[fold]]) - } - - result -} - -# Plot cells coloured as good, bad, or unlabelled. -# -# By default the function uses centroids because they are always available in -# the current workflow. For k-fold results, choose which fold to display. -plot_labelled_cells <- function( - result, - dataset=c("train", "test"), - fold=NULL, - size=0.05, - alpha=0.8, - scaleBar=TRUE -) { - dataset <- match.arg(dataset) - level_result <- get_result_level(result, fold=fold) - - spe_subset <- switch(dataset, - train=level_result$spe_train, - test=level_result$spe_test - ) - df_eval <- switch(dataset, - train=level_result$train_eval, - test=level_result$test_eval - ) - - spe_subset <- append_eval_labels(spe_subset, df_eval) - - title_suffix <- if (identical(result$split_type, "k_fold_cv")) { - paste0(" - fold ", fold) - } else { - "" - } - - plotCentroids( - spe_subset, - colourBy="eval_label", - palette="eval_label_color", - size=size, - alpha=alpha, - scaleBar=scaleBar - ) + - ggplot2::labs( - title=paste0("Labelled cells (", dataset, ")", title_suffix), - colour="Reference label", - fill="Reference label" - ) -} - -# Compute the points of a ROC curve from evaluation labels and QS values. -# -# The positive class is the bad-quality group, so the curve is built using -# `1 - QC_score` as the decision score. -build_roc_df <- function(df_eval) { - if (is.null(df_eval) || nrow(df_eval) == 0L) { - return(data.frame()) - } - - label_positive <- as.integer(df_eval$qcscore_train == 0) - n_pos <- sum(label_positive == 1L) - n_neg <- sum(label_positive == 0L) - - if (n_pos == 0L || n_neg == 0L) { - return(data.frame()) - } - - score_bad <- 1 - df_eval$QC_score - ord <- order(score_bad, decreasing=TRUE) - score_bad <- score_bad[ord] - label_positive <- label_positive[ord] - - tp <- cumsum(label_positive == 1L) - fp <- cumsum(label_positive == 0L) - change <- c(score_bad[-1] != score_bad[-length(score_bad)], TRUE) - - data.frame( - fpr=c(0, fp[change] / n_neg), - tpr=c(0, tp[change] / n_pos) - ) -} - -# Plot ROC curves for a single split or across folds. -# -# In k-fold mode the function overlays one ROC curve per fold and reports the -# average AUC in the subtitle. -plot_roc_eval <- function(result, dataset=c("train", "test")) { - dataset <- match.arg(dataset) - - if (identical(result$split_type, "k_fold_cv")) { - roc_list <- lapply(seq_along(result$folds), function(i) { - df_eval <- switch(dataset, - train=result$folds[[i]]$train_eval, - test=result$folds[[i]]$test_eval - ) - roc_df <- build_roc_df(df_eval) - if (nrow(roc_df) == 0L) { - return(NULL) - } - roc_df$fold <- i - roc_df - }) - roc_list <- Filter(Negate(is.null), roc_list) - - if (length(roc_list) == 0L) { - stop("No ROC curves available for the selected dataset") - } - - roc_df <- do.call(rbind, roc_list) - auc_mean <- switch(dataset, - train=result$train_summary_average$auc_bad_vs_good, - test=result$test_summary_average$auc_bad_vs_good - ) - - return( - ggplot2::ggplot(roc_df, - ggplot2::aes(x=fpr, y=tpr, colour=factor(fold), group=fold)) + - ggplot2::geom_abline(intercept=0, slope=1, linetype=2, - colour="grey60") + - ggplot2::geom_path(linewidth=0.7, alpha=0.8) + - ggplot2::coord_fixed() + - ggplot2::theme_bw() + - ggplot2::labs( - title=paste0("ROC curves across folds (", dataset, ")"), - subtitle=paste0("Mean AUC = ", round(auc_mean, 4)), - x="False positive rate", - y="True positive rate", - colour="Fold" - ) - ) - } - - df_eval <- switch(dataset, - train=result$train_eval, - test=result$test_eval - ) - roc_df <- build_roc_df(df_eval) - - if (nrow(roc_df) == 0L) { - stop("No ROC curve available for the selected dataset") - } - - auc_value <- switch(dataset, - train=result$train_summary$auc_bad_vs_good, - test=result$test_summary$auc_bad_vs_good - ) - - ggplot2::ggplot(roc_df, ggplot2::aes(x=fpr, y=tpr)) + - ggplot2::geom_abline(intercept=0, slope=1, linetype=2, - colour="grey60") + - ggplot2::geom_path(linewidth=0.9, colour="#0072B2") + - ggplot2::coord_fixed() + - ggplot2::theme_bw() + - ggplot2::labs( - title=paste0("ROC curve (", dataset, ")"), - subtitle=paste0("AUC = ", round(auc_value, 4)), - x="False positive rate", - y="True positive rate" - ) -} - -# Plot AUC values for each fold. -# -# In single-split mode the function returns a one-point summary. In k-fold mode -# it shows one point per fold and a dashed line for the average AUC. -plot_auc_across_folds <- function(result, dataset=c("train", "test")) { - dataset <- match.arg(dataset) - - if (identical(result$split_type, "k_fold_cv")) { - auc_df <- switch(dataset, - train=result$train_summary_per_fold, - test=result$test_summary_per_fold - ) - auc_mean <- switch(dataset, - train=result$train_summary_average$auc_bad_vs_good, - test=result$test_summary_average$auc_bad_vs_good - ) - - return( - ggplot2::ggplot(auc_df, - ggplot2::aes(x=fold, y=auc_bad_vs_good)) + - ggplot2::geom_hline(yintercept=auc_mean, linetype=2, - colour="#D55E00") + - ggplot2::geom_line(colour="#0072B2") + - ggplot2::geom_point(size=2.2, colour="#0072B2") + - ggplot2::theme_bw() + - ggplot2::labs( - title=paste0("AUC across folds (", dataset, ")"), - subtitle=paste0("Dashed line = mean AUC = ", round(auc_mean, 4)), - x="Fold", - y="AUC" - ) - ) - } - - auc_value <- switch(dataset, - train=result$train_summary$auc_bad_vs_good, - test=result$test_summary$auc_bad_vs_good - ) - auc_df <- data.frame(split=1, auc_bad_vs_good=auc_value) - - ggplot2::ggplot(auc_df, ggplot2::aes(x=split, y=auc_bad_vs_good)) + - ggplot2::geom_point(size=3, colour="#0072B2") + - ggplot2::theme_bw() + - ggplot2::scale_x_continuous(breaks=1, labels="split") + - ggplot2::labs( - title=paste0("AUC summary (", dataset, ")"), - subtitle=paste0("AUC = ", round(auc_value, 4)), - x=NULL, - y="AUC" - ) -} - -# Summarise QS performance against pseudo-reference labels. -# -# The summary reports: -# - number of labelled cells -# - number of bad/good examples -# - median QS in bad and good examples -# - AUROC for separating bad from good -# - sensitivity/specificity at the chosen threshold -summarise_eval <- function(df_eval, threshold=0.5) { - # Return an empty data.frame if the evaluation table is missing. - if (is.null(df_eval) || nrow(df_eval) == 0L) { - return(data.frame()) - } - - # Derive boolean masks for the pseudo-reference classes. - is_bad <- df_eval$qcscore_train == 0 - is_good <- df_eval$qcscore_train == 1 - - # A cell is predicted as bad if its QS is below the chosen threshold. - pred_bad <- df_eval$QC_score < threshold - - data.frame( - n_labeled=nrow(df_eval), - n_bad=sum(is_bad), - n_good=sum(is_good), - median_qs_bad=if (sum(is_bad) > 0) { - stats::median(df_eval$QC_score[is_bad]) - } else { - NA_real_ - }, - median_qs_good=if (sum(is_good) > 0) { - stats::median(df_eval$QC_score[is_good]) - } else { - NA_real_ - }, - # `1 - QC_score` is used so that larger values correspond to worse cells. - auc_bad_vs_good=auc_rank(1 - df_eval$QC_score, as.integer(is_bad)), - sensitivity_bad_qs_lt_threshold=if (sum(is_bad) > 0) { - mean(pred_bad[is_bad]) - } else { - NA_real_ - }, - specificity_good_qs_ge_threshold=if (sum(is_good) > 0) { - mean(!pred_bad[is_good]) - } else { - NA_real_ - } - ) -} - -# Create train/test indices for a single random split. -# -# `train_fraction` is only used when `k_folds = 1`. It controls the proportion -# of cells assigned to the training set, allowing e.g. 50/50, 70/30, 80/20. -make_random_split <- function(n_cells, train_fraction, seed) { - stopifnot(length(n_cells) == 1L, n_cells > 1L) - stopifnot(length(train_fraction) == 1L, !is.na(train_fraction)) - - if (train_fraction <= 0 || train_fraction >= 1) { - stop("train_fraction must be strictly between 0 and 1") - } - - set.seed(seed) - - # Sample the training cells at random. - n_train <- floor(n_cells * train_fraction) - if (n_train < 1L || n_train >= n_cells) { - stop("train_fraction produces an empty training or test set") - } - - train_idx <- sample(seq_len(n_cells), size=n_train, replace=FALSE) - test_idx <- setdiff(seq_len(n_cells), train_idx) - - list(train_idx=train_idx, test_idx=test_idx) -} - -# Assign each cell to one of the k folds used in cross-validation. -# -# Each fold acts as test set once, while the remaining folds are used for -# training. This function is only used when `k_folds > 1`. -make_fold_ids <- function(n_cells, k_folds, seed) { - stopifnot(length(n_cells) == 1L, n_cells > 1L) - stopifnot(length(k_folds) == 1L, !is.na(k_folds), k_folds >= 1) - - k_folds <- as.integer(k_folds) - if (k_folds < 1L) { - stop("k_folds must be >= 1") - } - - set.seed(seed) - - if (k_folds > n_cells) { - stop("k_folds cannot be greater than the number of cells") - } - - # Shuffle cells and then distribute them approximately evenly across folds. - shuffled <- sample(seq_len(n_cells)) - fold_id <- integer(n_cells) - fold_id[shuffled] <- rep(seq_len(k_folds), length.out=n_cells) - fold_id -} - -# Train the QS model on one training split/fold and evaluate it on both train -# and test cells. -# -# This is the core routine used by both single-split validation and k-fold CV. -run_one_fold <- function(spe, test_idx, threshold=0.5, verbose=TRUE) { - # Partition the input SpatialExperiment into training and test subsets. - spe_train <- spe[, -test_idx] - spe_test <- spe[, test_idx] - - # Compute outliers only on training cells, so the model is fitted without - # information leakage from the test set. - spe_train <- computeOutliersQCScore(spe_train) - spe_train <- checkOutliers(spe_train, verbose=verbose) - - # Extract the training formula selected by SpaceTrooper. - formula_vars <- S4Vectors::metadata(spe_train)$formula_variables - model_formula <- getModelFormula(formula_vars, verbose=verbose) - - # Create the balanced training table of pseudo-bad and pseudo-good cells. - df_train <- computeTrainDF( - colData=SummarizedExperiment::colData(spe_train), - formulaVars=formula_vars, - tech=S4Vectors::metadata(spe_train)$technology, - verbose=verbose - ) - - # Build the design matrix and estimate the ridge penalty. - x_train <- stats::model.matrix(stats::as.formula(model_formula), data=df_train) - best_lambda <- computeLambda(df_train, model_formula) - - # Fit the ridge logistic model. - fit <- trainModel(x_train, df_train) - - # Score both the training and test subsets using the trained model. - spe_train <- score_subset(spe_train, fit, best_lambda, model_formula) - spe_test <- score_subset(spe_test, fit, best_lambda, model_formula) - - # Training evaluation uses the very same labelled examples used for fitting. - train_eval <- merge( - data.frame(cell_id=spe_train$cell_id, QC_score=spe_train$QC_score), - df_train[, c("cell_id", "qcscore_train")], - by="cell_id" - ) - - # On the test set, build pseudo-reference labels independently. - test_ref <- safe_reference_labels(spe_test, verbose=verbose) - test_eval <- if (is.null(test_ref$data)) { - NULL - } else { - merge( - data.frame(cell_id=spe_test$cell_id, QC_score=spe_test$QC_score), - test_ref$data, - by="cell_id" - ) - } - - # Return both summaries and raw objects/tables for further inspection. - list( - formula_variables=formula_vars, - model_formula=model_formula, - lambda=best_lambda, - train_class_balance=table(df_train$qcscore_train), - train_label_error=NA_character_, - train_summary=summarise_eval(train_eval, threshold=threshold), - test_summary=summarise_eval(test_eval, threshold=threshold), - test_label_error=test_ref$error, - spe_train=spe_train, - spe_test=spe_test, - train_eval=train_eval, - test_eval=test_eval - ) -} - -# Combine per-fold summaries into a single table and compute their mean. -# -# This is only used in k-fold mode. The output contains: -# - `per_fold`: one row per fold -# - `average`: average across folds for each metric -combine_fold_summaries <- function(fold_results, summary_name) { - summaries <- lapply(seq_along(fold_results), function(i) { - x <- fold_results[[i]][[summary_name]] - if (is.null(x) || nrow(x) == 0L) { - return(NULL) - } - x$fold <- i - x - }) - summaries <- Filter(Negate(is.null), summaries) - - if (length(summaries) == 0L) { - return(list(per_fold=data.frame(), average=data.frame())) - } - - per_fold <- do.call(rbind, summaries) - metric_cols <- setdiff(colnames(per_fold), "fold") - - # Average each numeric summary metric across folds. - average <- as.data.frame( - lapply(per_fold[, metric_cols, drop=FALSE], function(x) mean(x, na.rm=TRUE)) - ) - - list(per_fold=per_fold, average=average) -} - -# Main entry point for evaluating SpaceTrooper QS on a CosMx dataset. -# -# Modes: -# - `k_folds = 1`: single random split using `train_fraction` -# - `k_folds > 1`: k-fold cross-validation -# -# Important notes: -# - `train_fraction` is ignored when `k_folds > 1` -# - `threshold` is used only for classification summaries, not for model fitting -evaluate_qs_split <- function( - data_dir="/Users/inzirio/Downloads/CosMx_data/DBKero/CosMx_Breast/CosMx_data_Case2", - sample_name="CosMx_Case2", - seed=1713, - threshold=0.5, - k_folds=1, - train_fraction=0.5, - verbose=TRUE -) { - # Validate the high-level input parameters. - stopifnot(length(k_folds) == 1L, !is.na(k_folds), k_folds >= 1) - k_folds <- as.integer(k_folds) - stopifnot(length(train_fraction) == 1L, !is.na(train_fraction)) - - if (train_fraction <= 0 || train_fraction >= 1) { - stop("train_fraction must be strictly between 0 and 1") - } - - # Read the CosMx dataset from disk. - spe <- readCosmxSPE(data_dir, sampleName=sample_name) - - # Compute the per-cell QC metrics required by the QS model. - spe <- spatialPerCellQC( - spe, - rmZeros=TRUE, - negProbList=c("NegPrb", "Negative", "SystemControl") - ) - - # Count the number of cells remaining after QC preprocessing. - n_cells <- ncol(spe) - - if (k_folds == 1L) { - # Single random split: build train/test indices according to the - # requested training fraction. - split_idx <- make_random_split( - n_cells=n_cells, - train_fraction=train_fraction, - seed=seed - ) - - res <- run_one_fold( - spe=spe, - test_idx=split_idx$test_idx, - threshold=threshold, - verbose=verbose - ) - - return(c( - list( - split_type="random_split", - k_folds=k_folds, - seed=seed, - threshold=threshold, - train_fraction=train_fraction, - test_fraction=1 - train_fraction, - n_cells=n_cells, - n_train=length(split_idx$train_idx), - n_test=length(split_idx$test_idx) - ), - res - )) - } - - # K-fold mode: assign each cell to a fold once. - fold_id <- make_fold_ids(n_cells=n_cells, k_folds=k_folds, seed=seed) - - fold_results <- lapply(seq_len(k_folds), function(i) { - if (verbose) { - message("Running fold ", i, " of ", k_folds) - } - - # Fold i is the test set; all other folds are the training set. - run_one_fold( - spe=spe, - test_idx=which(fold_id == i), - threshold=threshold, - verbose=verbose - ) - }) - - train_combined <- combine_fold_summaries(fold_results, "train_summary") - test_combined <- combine_fold_summaries(fold_results, "test_summary") - test_label_errors <- vapply( - fold_results, - extract_optional_chr1, - character(1), - name = "test_label_error" - ) - - train_label_errors <- vapply( - fold_results, - extract_optional_chr1, - character(1), - name = "train_label_error" - ) - - # if you only want real errors later, drop missing values explicitly - test_label_errors <- stats::na.omit(test_label_errors) - train_label_errors <- stats::na.omit(train_label_errors) - - list( - split_type="k_fold_cv", - k_folds=k_folds, - seed=seed, - threshold=threshold, - n_cells=n_cells, - fold_assignment=fold_id, - train_summary_per_fold=train_combined$per_fold, - train_summary_average=train_combined$average, - test_summary_per_fold=test_combined$per_fold, - test_summary_average=test_combined$average, - test_label_errors=test_label_errors, - train_label_errors=train_label_errors, - folds=fold_results - ) -} - -# Esempi di utilizzo: -# source("inst/scripts/evaluate_qs_split_kfold.R") -# -# Split singolo 50/50 -# res_split_50_50 <- evaluate_qs_split(k_folds=1, train_fraction=0.50) -# res_split_50_50$test_summary -# -# Split singolo 70/30 -# res_split_70_30 <- evaluate_qs_split(k_folds=1, train_fraction=0.70) -# res_split_70_30$test_summary -res_split <- evaluate_qs_split(k_folds=1, train_fraction=0.70) - -plot_labelled_cells(res_split, dataset="train", size=0.08) -plot_labelled_cells(res_split, dataset="test", size=0.08) - -plot_roc_eval(res_split, dataset="test") -plot_auc_across_folds(res_split, dataset="test") -# -# Split singolo 80/20 -# res_split_80_20 <- evaluate_qs_split(k_folds=1, train_fraction=0.80) -# res_split_80_20$test_summary -# -# 5-fold cross-validation -res_kfold_5 <- evaluate_qs_split(k_folds=5) -res_kfold_5$test_summary_per_fold -res_kfold_5$test_summary_average -res_kfold_5$test_label_errors -# -# Celle colorate good/bad sul test set del fold 1 -plot_labelled_cells(res_kfold_5, dataset="test", fold=1, size=0.1) -# -# ROC curve across folds -plot_roc_eval(res_kfold_5, dataset="test") -# -# AUC across folds -plot_auc_across_folds(res_kfold_5, dataset="test") diff --git a/inst/scripts/run_qs_general.R b/inst/scripts/run_qs_general.R deleted file mode 100644 index 607cf52..0000000 --- a/inst/scripts/run_qs_general.R +++ /dev/null @@ -1,275 +0,0 @@ - -package_path <- "/Users/inzirio/My Drive/works/coding/SpaceTrooper" -input_rds <- "~/SpaceTrooper_qs_check/cosmx_case2_spe_raw.rds" -output_rds <- "~/SpaceTrooper_qs_check/cosmx_case2_qs_result_interaction.rds" - - -message("Package path: ", package_path) -message("Input RDS: ", input_rds) -message("Output RDS: ", output_rds) - -library(devtools) -library(SpatialExperiment) - -devtools::load_all(package_path, quiet=FALSE) - - -obj <- readRDS(input_rds) -metadata(obj)$formula_variables <- c(log2CountArea="log2CountArea", - log2AspectRatio="log2AspectRatio") -obj <- spatialPerCellQC(obj) - -metadata(obj) -obj <- computeQCScore(obj) - -qc_nointeract <- obj$QC_score -qc_interact <- obj$QC_score -saveRDS(qc_nointeract, "~/SpaceTrooper_qs_check/qc_nointeract.rds") -saveRDS(qc_interact, "~/SpaceTrooper_qs_check/qc_interact.rds") - -library(ggplot2) - -# assume qc_nointeract and qc_interact exist in the environment -df <- data.frame(qc_nointeract = qc_nointeract, qc_interact = qc_interact) - -# correlation -cor_val <- cor(df$qc_nointeract, df$qc_interact, use = "complete.obs", method = "pearson") - -# plot: use 2D binning for large datasets, points otherwise; add y=x line and annotation -plot_qc <- ggplot(df, aes(x = qc_nointeract, y = qc_interact)) + - geom_point(alpha = 0.35, size = 0.6) + - geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") + -# scale_fill_gradient(low = "white", high = "steelblue") + - labs( - x = "QC score (no interaction)", - y = "QC score (with interaction)", - title = "QC: no-interaction vs interaction" - ) + - theme_minimal() + - geom_hline(yintercept = 0.75) + - geom_vline(xintercept = 0.75) + - annotate( - "text", - x = quantile(df$qc_nointeract, 0.99, na.rm = TRUE), - y = quantile(df$qc_interact, 0.01, na.rm = TRUE), - label = paste0("r = ", formatC(cor_val, digits = 3, format = "f")), - hjust = 1, vjust = 0 - ) - -# return the plot object -plot_qc - - -library(ggplot2) - -df <- data.frame( - qc_nointeract=qc_nointeract, - qc_interact=qc_interact -) - -threshold <- 0.75 - -df$quadrant <- with( - df, - ifelse( - qc_nointeract < threshold & qc_interact < threshold, - "low / low", - ifelse( - qc_nointeract >= threshold & qc_interact < threshold, - "high no-interaction / low interaction", - ifelse( - qc_nointeract < threshold & qc_interact >= threshold, - "low no-interaction / high interaction", - "high / high" - ) - ) - ) -) - -table(df$quadrant) - -cor_val <- cor( - df$qc_nointeract, - df$qc_interact, - use="complete.obs", - method="pearson" -) - -plot_qc <- ggplot(df, aes(x=qc_nointeract, y=qc_interact, color=quadrant)) + - geom_point(alpha=0.35, size=0.6) + - geom_abline( - slope=1, - intercept=0, - linetype="dashed", - color="red" - ) + - geom_hline(yintercept=threshold) + - geom_vline(xintercept=threshold) + - labs( - x="QC score (no interaction)", - y="QC score (with interaction)", - title="QC: no-interaction vs interaction", - color="Quadrant" - ) + - theme_minimal() + - annotate( - "text", - x=quantile(df$qc_nointeract, 0.99, na.rm=TRUE), - y=quantile(df$qc_interact, 0.01, na.rm=TRUE), - label=paste0("r = ", formatC(cor_val, digits=3, format="f")), - hjust=1, - vjust=0, - color="black" - ) - -plot_qc - -plot_qc <- plot_qc + - scale_color_manual( - values=c( - "low / low"="grey60", - "high no-interaction / low interaction"="orange", - "low no-interaction / high interaction"="dodgerblue", - "high / high"="black" - ) - ) - -ggsave( - filename="~/SpaceTrooper_qs_check/qc_nointeraction_vs_interaction_A4_landscape.pdf", - plot=plot_qc, - device="pdf", - width=11.69, - height=8.27, - units="in" -) -# get_coldata_df <- function(x) { -# cd <- SummarizedExperiment::colData(x) -# as.data.frame(cd) -# } - -# get_qscore_from_coldata <- function(x) { -# cd <- get_coldata_df(x) - -# candidate_names <- c( -# "quality_score", -# "QualityScore", -# "qualityScore", -# "qscore", -# "QScore", -# "Q_score", -# "q_score", -# "score", -# "flag_score", -# "qc_score", -# "QC_score", -# "QCScore", -# "cell_quality_score", -# "spatial_quality_score" -# ) - -# exact_hits <- intersect(candidate_names, colnames(cd)) - -# if (length(exact_hits) > 0) { -# selected <- exact_hits[1] -# return(list( -# values=cd[[selected]], -# column=selected, -# method="exact_name_match", -# all_candidates=exact_hits, -# coldata=cd -# )) -# } - -# pattern_hits <- grep( -# "quality|qscore|q_score|qc_score|flag_score|score", -# colnames(cd), -# ignore.case=TRUE, -# value=TRUE -# ) - -# numeric_pattern_hits <- pattern_hits[ -# vapply(cd[pattern_hits], is.numeric, logical(1)) -# ] - -# if (length(numeric_pattern_hits) > 0) { -# selected <- numeric_pattern_hits[1] -# return(list( -# values=cd[[selected]], -# column=selected, -# method="pattern_numeric_match", -# all_candidates=numeric_pattern_hits, -# coldata=cd -# )) -# } - -# stop( -# "Could not automatically identify Quality Score column.\n", -# "Available colData columns are:\n", -# paste(colnames(cd), collapse=", ") -# ) -# } - -# call_function_if_exists <- function(fun_name, x) { -# if (!exists(fun_name, mode="function")) { -# stop("Function not found in loaded SpaceTrooper branch: ", fun_name) -# } - -# fun <- get(fun_name, mode="function") -# fun(x) -# } - -# obj_before <- obj - -# if (run_spatial_qc) { -# message("Running spatial QC function...") -# obj <- call_function_if_exists(spatial_qc_fun, obj) -# } - -# message("Running Quality Score function...") -# obj_after <- call_function_if_exists(qscore_fun, obj) - -# message("Extracting Quality Score...") -# qs_info <- get_qscore_from_coldata(obj_after) - -# qs <- qs_info$values - -# if (!is.numeric(qs)) { -# warning("Detected Quality Score column is not numeric: ", qs_info$column) -# } - -# out <- list( -# git_branch=git_branch, -# git_hash=git_hash, -# package_path=normalizePath(package_path), -# input_rds=normalizePath(input_rds), -# spatial_qc_fun=spatial_qc_fun, -# qscore_fun=qscore_fun, -# run_spatial_qc=run_spatial_qc, -# qscore_column=qs_info$column, -# qscore_detection_method=qs_info$method, -# qscore_candidates=qs_info$all_candidates, -# n=length(qs), -# object_class=class(obj_after), -# object_dim=tryCatch(dim(obj_after), error=function(e) NULL), -# object_colnames=tryCatch(colnames(obj_after), error=function(e) NULL), -# quality_score=qs, -# quality_score_summary=summary(qs), -# colData=qs_info$coldata, -# session_info=sessionInfo() -# ) - -# saveRDS(out, output_rds) - -# message("Saved result to: ", output_rds) -# message("Detected Quality Score column: ", qs_info$column) -# message("Detection method: ", qs_info$method) -# message("Number of values: ", length(qs)) -# print(summary(qs)) - - -# computeQCScore(obj) - -# # model_formula <- paste0("~ log2CountArea + I(abs(log2AspectRatio) ", -# # "* as.numeric(dist_border<50)) + ", -# # " log2CountArea:I(abs(log2AspectRatio)", -# # "* as.numeric(dist_border<50))") #for cosmx diff --git a/inst/scripts/transfer_coeffs_cosmx_cosmx.R b/inst/scripts/transfer_coeffs_cosmx_cosmx.R deleted file mode 100644 index d6d0aea..0000000 --- a/inst/scripts/transfer_coeffs_cosmx_cosmx.R +++ /dev/null @@ -1,730 +0,0 @@ -## ============================================================ -## 0. Libraries and output directory -## ============================================================ - -devtools::load_all() -library(ggplot2) - -out_dir <- "~/SpaceTrooper_qs_check" - -set.seed(1998) - - -## ============================================================ -## 1. Define common CosMx model formula -## ============================================================ -## Since both datasets are CosMx, we can use the CosMx-specific formula, -## including the border-adjusted aspect ratio term and control ratio. - -cosmx_formula <- paste0( - "~(", - "log2SignalDensity + ", - "Area_um + ", - "I(abs(log2AspectRatio) * as.numeric(dist_border < 50)) + ", - "log2Ctrl_total_ratio", - ")^2" -) - - -## ============================================================ -## 2. Read and preprocess CosMx Breast dataset -## ============================================================ - -cosmx_breast_path <- "~/Downloads/CosMx_data/DBKero/CosMx_Breast/CosMx_data_Case2/" - -specosm_breast <- readCosmxSPE(cosmx_breast_path) - -specosm_breast <- spatialPerCellQC(specosm_breast) - - -## ============================================================ -## 3. Read and preprocess CosMx Pancreas dataset -## ============================================================ - -cosmx_pancreas_path <- "/Users/inzirio/Downloads/CosMx_data/Pancreas-CosMx-WTx" - -specosm_pancreas <- readCosmxSPE(cosmx_pancreas_path) - -specosm_pancreas <- spatialPerCellQC(specosm_pancreas) - - -## ============================================================ -## 4. Train native QS models on both datasets -## ============================================================ - -specosm_breast <- computeQCScore( - spe=specosm_breast, - modelFormula=cosmx_formula, - verbose=TRUE -) - -specosm_pancreas <- computeQCScore( - spe=specosm_pancreas, - modelFormula=cosmx_formula, - verbose=TRUE -) - - -## ============================================================ -## 5. Store native QS scores and trained models -## ============================================================ - -specosm_breast$QC_score_breast_native <- specosm_breast$QC_score -specosm_pancreas$QC_score_pancreas_native <- specosm_pancreas$QC_score - -breast_model <- metadata(specosm_breast)$QCScore_model -pancreas_model <- metadata(specosm_pancreas)$QCScore_model - - -## ============================================================ -## 6. Apply transferred models -## ============================================================ -## Apply Pancreas-trained model to Breast. -## Apply Breast-trained model to Pancreas. - -specosm_breast <- applyQCScoreModel( - spe=specosm_breast, - qcModel=pancreas_model, - scoreName="QC_score_pancreas_model" -) - -specosm_pancreas <- applyQCScoreModel( - spe=specosm_pancreas, - qcModel=breast_model, - scoreName="QC_score_breast_model" -) - - -## ============================================================ -## 7. Basic checks -## ============================================================ - -summary(specosm_breast$QC_score_breast_native) -summary(specosm_breast$QC_score_pancreas_model) - -summary(specosm_pancreas$QC_score_pancreas_native) -summary(specosm_pancreas$QC_score_breast_model) - -range(specosm_breast$QC_score_breast_native, na.rm=TRUE) -range(specosm_breast$QC_score_pancreas_model, na.rm=TRUE) - -range(specosm_pancreas$QC_score_pancreas_native, na.rm=TRUE) -range(specosm_pancreas$QC_score_breast_model, na.rm=TRUE) - -sum(is.na(specosm_breast$QC_score_breast_native)) -sum(is.na(specosm_breast$QC_score_pancreas_model)) - -sum(is.na(specosm_pancreas$QC_score_pancreas_native)) -sum(is.na(specosm_pancreas$QC_score_breast_model)) - - -## ============================================================ -## 8. Correlations: native vs transferred -## ============================================================ - -cor_breast_spearman <- cor( - specosm_breast$QC_score_breast_native, - specosm_breast$QC_score_pancreas_model, - use="complete.obs", - method="spearman" -) - -cor_breast_pearson <- cor( - specosm_breast$QC_score_breast_native, - specosm_breast$QC_score_pancreas_model, - use="complete.obs", - method="pearson" -) - -cor_pancreas_spearman <- cor( - specosm_pancreas$QC_score_pancreas_native, - specosm_pancreas$QC_score_breast_model, - use="complete.obs", - method="spearman" -) - -cor_pancreas_pearson <- cor( - specosm_pancreas$QC_score_pancreas_native, - specosm_pancreas$QC_score_breast_model, - use="complete.obs", - method="pearson" -) - -cor_breast_spearman -cor_breast_pearson - -cor_pancreas_spearman -cor_pancreas_pearson - - -## ============================================================ -## 9. Plot: Breast native QS vs Pancreas-trained QS -## ============================================================ - -res_breast <- plot_qc_score_comparison( - x=specosm_breast$QC_score_breast_native, - y=specosm_breast$QC_score_pancreas_model, - x_label="Breast native QS", - y_label="Pancreas-trained QS", - title="CosMx Breast: native QS vs Pancreas-trained QS", - threshold=0.75, - cor_method="spearman", - output_file=file.path( - out_dir, - "cosmx_breast_native_vs_pancreas_model_A4_landscape.pdf" - ) -) - -res_breast$plot -res_breast$correlation -res_breast$quadrant_table - - -## ============================================================ -## 10. Plot: Pancreas native QS vs Breast-trained QS -## ============================================================ - -res_pancreas <- plot_qc_score_comparison( - x=specosm_pancreas$QC_score_pancreas_native, - y=specosm_pancreas$QC_score_breast_model, - x_label="Pancreas native QS", - y_label="Breast-trained QS", - title="CosMx Pancreas: native QS vs Breast-trained QS", - threshold=0.75, - cor_method="spearman", - output_file=file.path( - out_dir, - "cosmx_pancreas_native_vs_breast_model_A4_landscape.pdf" - ) -) - -res_pancreas$plot -res_pancreas$correlation -res_pancreas$quadrant_table - - -## ============================================================ -## 11. Compare model formulas and coefficients -## ============================================================ - -metadata(specosm_breast)$QCScore_model$model_formula -metadata(specosm_pancreas)$QCScore_model$model_formula - -metadata(specosm_breast)$QCScore_model$model_matrix_colnames -metadata(specosm_pancreas)$QCScore_model$model_matrix_colnames - -metadata(specosm_breast)$QCScore_model$coefficients_table -metadata(specosm_pancreas)$QCScore_model$coefficients_table - - -## ============================================================ -## 12. Save summary table -## ============================================================ - -cosmx_transfer_summary <- data.frame( - comparison=c( - "Breast native vs Pancreas-trained", - "Pancreas native vs Breast-trained" - ), - spearman=c( - cor_breast_spearman, - cor_pancreas_spearman - ), - pearson=c( - cor_breast_pearson, - cor_pancreas_pearson - ), - stringsAsFactors=FALSE -) - -cosmx_transfer_summary - -write.csv( - cosmx_transfer_summary, - file=file.path(out_dir, "cosmx_breast_pancreas_transfer_summary.csv"), - row.names=FALSE -) - -plot_qc_score_coldata <- function(x, y, spe, color_col, - x_label="x", y_label="y", - title="QC score comparison", - threshold=0.75, - cor_method="spearman", - output_file=NULL) { - - stopifnot(length(x) == length(y)) - stopifnot(ncol(spe) == length(x)) - stopifnot(color_col %in% colnames(colData(spe))) - - df <- data.frame( - x=x, - y=y, - color_var=as.data.frame(colData(spe))[[color_col]] - ) - - cor_val <- cor( - df$x, - df$y, - use="complete.obs", - method=cor_method - ) - - p <- ggplot(df, aes(x=x, y=y, color=color_var)) + - geom_point(alpha=0.35, size=0.6) + - geom_abline( - slope=1, - intercept=0, - linetype="dashed", - color="red" - ) + - geom_hline(yintercept=threshold) + - geom_vline(xintercept=threshold) + - labs( - x=x_label, - y=y_label, - title=title, - color=color_col - ) + - theme_minimal() + - annotate( - "text", - x=quantile(df$x, 0.99, na.rm=TRUE), - y=quantile(df$y, 0.01, na.rm=TRUE), - label=paste0( - cor_method, - " r = ", - formatC(cor_val, digits=3, format="f") - ), - hjust=1, - vjust=0, - color="black" - ) - - if (!is.null(output_file)) { - ggsave( - filename=output_file, - plot=p, - device="pdf", - width=11.69, - height=8.27, - units="in" - ) - } - - return(list( - plot=p, - correlation=cor_val, - data=df - )) -} - - -res_breast_signal <- plot_qc_score_coldata( - x=specosm_breast$QC_score_breast_native, - y=specosm_breast$QC_score_pancreas_model, - spe=specosm_breast, - color_col="log2SignalDensity", - x_label="Breast native QS", - y_label="Pancreas-trained QS", - title="CosMx Breast: native QS vs Pancreas-trained QS", - threshold=0.75, - cor_method="spearman", - output_file="~/SpaceTrooper_qs_check/cosmx_breast_native_vs_pancreas_model_colored_by_log2SignalDensity.pdf" -) - -res_breast_signal$plot - -res_breast_area <- plot_qc_score_coldata( - x=specosm_breast$QC_score_breast_native, - y=specosm_breast$QC_score_pancreas_model, - spe=specosm_breast, - color_col="Area_um", - x_label="Breast native QS", - y_label="Pancreas-trained QS", - title="CosMx Breast: native QS vs Pancreas-trained QS", - threshold=0.75, - cor_method="spearman", - output_file="~/SpaceTrooper_qs_check/cosmx_breast_native_vs_pancreas_model_colored_by_Area_um.pdf" -) - -res_breast_area$plot - -res_breast_ctrl <- plot_qc_score_coldata( - x=specosm_breast$QC_score_breast_native, - y=specosm_breast$QC_score_pancreas_model, - spe=specosm_breast, - color_col="log2Ctrl_total_ratio", - x_label="Breast native QS", - y_label="Pancreas-trained QS", - title="CosMx Breast: native QS vs Pancreas-trained QS", - threshold=0.75, - cor_method="spearman", - output_file="~/SpaceTrooper_qs_check/cosmx_breast_native_vs_pancreas_model_colored_by_log2Ctrl_total_ratio.pdf" -) - -res_breast_ctrl$plot - -res_breast_aspect <- plot_qc_score_coldata( - x=specosm_breast$QC_score_breast_native, - y=specosm_breast$QC_score_pancreas_model, - spe=specosm_breast, - color_col="log2AspectRatio", - x_label="Breast native QS", - y_label="Pancreas-trained QS", - title="CosMx Breast: native QS vs Pancreas-trained QS", - threshold=0.75, - cor_method="spearman", - output_file="~/SpaceTrooper_qs_check/cosmx_breast_native_vs_pancreas_model_colored_by_log2AspectRatio.pdf" -) - -res_breast_aspect$plot - -SpaceTrooper::plotMetricHist(spe = specosm_breast, metric="log2Ctrl_total_ratio") -SpaceTrooper::plotMetricHist(spe = specosm_pancreas, metric="log2Ctrl_total_ratio") - -############################################################ -specosm_mb1 <- readCosmxSPE("/Users/inzirio/Downloads/CosMx_data/CosMx1k_MouseBrain1") -specosm_mb2 <- readCosmxSPE("/Users/inzirio/Downloads/CosMx_data/CosMx1k_MouseBrain2") - - -specosm_mb1 <- spatialPerCellQC(specosm_mb1) -specosm_mb2 <- spatialPerCellQC(specosm_mb2) - -cosmx_formula <- paste0( - "~(", - "log2SignalDensity + ", - "Area_um + ", - "I(abs(log2AspectRatio) * as.numeric(dist_border < 50)) + ", - "log2Ctrl_total_ratio", - ")^2" -) - -specosm_mb1 <- computeQCScore( - spe=specosm_mb1, - modelFormula=cosmx_formula, - verbose=TRUE -) - -specosm_mb2 <- computeQCScore( - spe=specosm_mb2, - modelFormula=cosmx_formula, - verbose=TRUE -) - -specosm_mb1$QC_score_mb1_native <- specosm_mb1$QC_score -specosm_mb2$QC_score_mb2_native <- specosm_mb2$QC_score - -mb1_model <- metadata(specosm_mb1)$QCScore_model -mb2_model <- metadata(specosm_mb2)$QCScore_model - -specosm_mb1 <- applyQCScoreModel( - spe=specosm_mb1, - qcModel=mb2_model, - scoreName="QC_score_mb2_model" -) - -specosm_mb2 <- applyQCScoreModel( - spe=specosm_mb2, - qcModel=mb1_model, - scoreName="QC_score_mb1_model" -) - -cor_mb1_spearman <- cor( - specosm_mb1$QC_score_mb1_native, - specosm_mb1$QC_score_mb2_model, - use="complete.obs", - method="spearman" -) - -cor_mb1_pearson <- cor( - specosm_mb1$QC_score_mb1_native, - specosm_mb1$QC_score_mb2_model, - use="complete.obs", - method="pearson" -) - -cor_mb2_spearman <- cor( - specosm_mb2$QC_score_mb2_native, - specosm_mb2$QC_score_mb1_model, - use="complete.obs", - method="spearman" -) - -cor_mb2_pearson <- cor( - specosm_mb2$QC_score_mb2_native, - specosm_mb2$QC_score_mb1_model, - use="complete.obs", - method="pearson" -) - -res_mb1 <- plot_qc_score_comparison( - x=specosm_mb1$QC_score_mb1_native, - y=specosm_mb1$QC_score_mb2_model, - x_label="MouseBrain1 native QS", - y_label="MouseBrain2-trained QS", - title="CosMx MouseBrain1: native QS vs MouseBrain2-trained QS", - threshold=0.75, - cor_method="spearman", - output_file=file.path( - out_dir, - "cosmx_mousebrain1_native_vs_mousebrain2_model_A4_landscape.pdf" - ) -) - -res_mb2 <- plot_qc_score_comparison( - x=specosm_mb2$QC_score_mb2_native, - y=specosm_mb2$QC_score_mb1_model, - x_label="MouseBrain2 native QS", - y_label="MouseBrain1-trained QS", - title="CosMx MouseBrain2: native QS vs MouseBrain1-trained QS", - threshold=0.75, - cor_method="spearman", - output_file=file.path( - out_dir, - "cosmx_mousebrain2_native_vs_mousebrain1_model_A4_landscape.pdf" - ) -) - -res_mb1$plot -res_mb2$plot - -color_vars <- c( - "log2SignalDensity", - "Area_um", - "log2AspectRatio", - "log2Ctrl_total_ratio" -) - -mb1_colored_plots <- lapply(color_vars, function(v) { - plot_qc_score_coldata( - x=specosm_mb1$QC_score_mb1_native, - y=specosm_mb1$QC_score_mb2_model, - spe=specosm_mb1, - color_col=v, - x_label="MouseBrain1 native QS", - y_label="MouseBrain2-trained QS", - title=paste0("MouseBrain1 native vs MouseBrain2-trained QS: ", v), - threshold=0.75, - cor_method="spearman", - output_file=file.path( - out_dir, - paste0("cosmx_mousebrain1_native_vs_mousebrain2_model_", v, ".pdf") - ) - ) -}) -names(mb1_colored_plots) <- color_vars - -mb2_colored_plots <- lapply(color_vars, function(v) { - plot_qc_score_coldata( - x=specosm_mb2$QC_score_mb2_native, - y=specosm_mb2$QC_score_mb1_model, - spe=specosm_mb2, - color_col=v, - x_label="MouseBrain2 native QS", - y_label="MouseBrain1-trained QS", - title=paste0("MouseBrain2 native vs MouseBrain1-trained QS: ", v), - threshold=0.75, - cor_method="spearman", - output_file=file.path( - out_dir, - paste0("cosmx_mousebrain2_native_vs_mousebrain1_model_", v, ".pdf") - ) - ) -}) -names(mb2_colored_plots) <- color_vars -mb2_colored_plots -cosmx_mousebrain_transfer_summary <- data.frame( - comparison=c( - "MouseBrain1 native vs MouseBrain2-trained", - "MouseBrain2 native vs MouseBrain1-trained" - ), - spearman=c(cor_mb1_spearman, cor_mb2_spearman), - pearson=c(cor_mb1_pearson, cor_mb2_pearson), - stringsAsFactors=FALSE -) - -cosmx_mousebrain_transfer_summary - -write.csv( - cosmx_mousebrain_transfer_summary, - file=file.path(out_dir, "cosmx_mousebrain1_mousebrain2_transfer_summary.csv"), - row.names=FALSE -) - -metadata(specosm_mb1)$QCScore_model$model_matrix_colnames -metadata(specosm_mb2)$QCScore_model$model_matrix_colnames - -#################### - -spexen_bc1 <- readXeniumSPE("/Users/inzirio/Downloads/Xenium_data/Xenium_HumanBreast1_Janesick/") -spexen_bc2 <- readXeniumSPE("/Users/inzirio/Downloads/Xenium_data/Xenium_HumanBreast2_Janesick") - - -spexen_bc1 <- spatialPerCellQC(spexen_bc1) -spexen_bc2 <- spatialPerCellQC(spexen_bc2) - -xenium_formula <- paste0( - "~(", - "log2SignalDensity + ", - "Area_um + ", - "log2AspectRatio + ", - "log2Ctrl_total_ratio", - ")^2" -) - -spexen_bc1 <- computeQCScore( - spe=spexen_bc1, - modelFormula=xenium_formula, - verbose=TRUE -) - -spexen_bc2 <- computeQCScore( - spe=spexen_bc2, - modelFormula=xenium_formula, - verbose=TRUE -) - -spexen_bc1$QC_score_bc1_native <- spexen_bc1$QC_score -spexen_bc2$QC_score_bc2_native <- spexen_bc2$QC_score - -bc1_model <- metadata(spexen_bc1)$QCScore_model -bc2_model <- metadata(spexen_bc2)$QCScore_model - -spexen_bc1 <- applyQCScoreModel( - spe=spexen_bc1, - qcModel=bc2_model, - scoreName="QC_score_bc2_model" -) - -spexen_bc2 <- applyQCScoreModel( - spe=spexen_bc2, - qcModel=bc1_model, - scoreName="QC_score_bc1_model" -) - -cor_bc1_spearman <- cor( - spexen_bc1$QC_score_bc1_native, - spexen_bc1$QC_score_bc2_model, - use="complete.obs", - method="spearman" -) - -cor_bc1_pearson <- cor( - spexen_bc1$QC_score_bc1_native, - spexen_bc1$QC_score_bc2_model, - use="complete.obs", - method="pearson" -) - -cor_bc2_spearman <- cor( - spexen_bc2$QC_score_bc2_native, - spexen_bc2$QC_score_bc1_model, - use="complete.obs", - method="spearman" -) - -cor_bc2_pearson <- cor( - spexen_bc2$QC_score_bc2_native, - spexen_bc2$QC_score_bc1_model, - use="complete.obs", - method="pearson" -) - -res_bc1 <- plot_qc_score_comparison( - x=spexen_bc1$QC_score_bc1_native, - y=spexen_bc1$QC_score_bc2_model, - x_label="Breast Cancer 1 native QS", - y_label="Breast Cancer 2-trained QS", - title="Xenium Breast Cancer 1: native QS vs Breast Cancer 2-trained QS", - threshold=0.75, - cor_method="spearman", - output_file=file.path( - out_dir, - "xenium_breast1_native_vs_breast2_model_A4_landscape.pdf" - ) -) - -res_bc2 <- plot_qc_score_comparison( - x=spexen_bc2$QC_score_bc2_native, - y=spexen_bc2$QC_score_bc1_model, - x_label="Breast Cancer 2 native QS", - y_label="Breast Cancer 1-trained QS", - title="Xenium Breast Cancer 2: native QS vs Breast Cancer 1-trained QS", - threshold=0.75, - cor_method="spearman", - output_file=file.path( - out_dir, - "xenium_breast2_native_vs_breast1_model_A4_landscape.pdf" - ) -) - -res_bc1$plot -res_bc2$plot - -color_vars <- c( - "log2SignalDensity", - "Area_um", - "log2AspectRatio", - "log2Ctrl_total_ratio" -) - -bc1_colored_plots <- lapply(color_vars, function(v) { - plot_qc_score_coldata( - x=spexen_bc1$QC_score_bc1_native, - y=spexen_bc1$QC_score_bc2_model, - spe=spexen_bc1, - color_col=v, - x_label="Breast Cancer 1 native QS", - y_label="Breast Cancer 2-trained QS", - title=paste0("Breast Cancer 1 native vs Breast Cancer 2-trained QS: ", v), - threshold=0.75, - cor_method="spearman", - output_file=file.path( - out_dir, - paste0("xenium_breast1_native_vs_breast2_model_", v, ".pdf") - ) - ) -}) -names(bc1_colored_plots) <- color_vars - -bc2_colored_plots <- lapply(color_vars, function(v) { - plot_qc_score_coldata( - x=spexen_bc2$QC_score_bc2_native, - y=spexen_bc2$QC_score_bc1_model, - spe=spexen_bc2, - color_col=v, - x_label="Breast Cancer 2 native QS", - y_label="Breast Cancer 1-trained QS", - title=paste0("Breast Cancer 2 native vs Breast Cancer 1-trained QS: ", v), - threshold=0.75, - cor_method="spearman", - output_file=file.path( - out_dir, - paste0("xenium_breast2_native_vs_breast1_model_", v, ".pdf") - ) - ) -}) -names(bc2_colored_plots) <- color_vars - -xenium_breast_transfer_summary <- data.frame( - comparison=c( - "Breast Cancer 1 native vs Breast Cancer 2-trained", - "Breast Cancer 2 native vs Breast Cancer 1-trained" - ), - spearman=c(cor_bc1_spearman, cor_bc2_spearman), - pearson=c(cor_bc1_pearson, cor_bc2_pearson), - stringsAsFactors=FALSE -) - -xenium_breast_transfer_summary - -write.csv( - xenium_breast_transfer_summary, - file=file.path(out_dir, "xenium_breast1_breast2_transfer_summary.csv"), - row.names=FALSE -) - -metadata(spexen_bc1)$QCScore_model$model_matrix_colnames -metadata(spexen_bc2)$QCScore_model$model_matrix_colnames - -######################## diff --git a/inst/scripts/transfer_coeffs_cosmx_xenium.R b/inst/scripts/transfer_coeffs_cosmx_xenium.R deleted file mode 100644 index ae260f5..0000000 --- a/inst/scripts/transfer_coeffs_cosmx_xenium.R +++ /dev/null @@ -1,200 +0,0 @@ -library(SpaceTrooper) -# library(SummarizedExperiment) -# library(S4Vectors) - -set.seed(1998) - -common_formula <- "~(log2SignalDensity + Area_um + log2AspectRatio)^2" - -## ----------------------------- -## Read Xenium data -## ----------------------------- - -dbkx_path <- "~/Downloads/Xenium_data/db_kero_xen" - -spexen <- readXeniumSPE(dbkx_path) - -spexen <- spatialPerCellQC(spexen) - -spexen <- computeQCScore( - spexen, - modelFormula=common_formula, - verbose=TRUE -) - -sum(is.na(spexen$QC_score)) -colSums(is.na(as.data.frame(colData(spexen))[ - , c("log2SignalDensity", "Area_um", "log2AspectRatio") -])) - -length(spexen$QC_score) == ncol(spexen) - -sum(is.na(spexen$QC_score)) -summary(spexen$QC_score) - -metadata(spexen)$QCScore_model$model_formula -metadata(spexen)$QCScore_model$model_matrix_colnames -metadata(spexen)$QCScore_model$coefficients_table - - -## ----------------------------- -## Read CosMx data -## ----------------------------- - -cosm_path <- "~/Downloads/CosMx_data/DBKero/CosMx_Breast/CosMx_data_Case2/" - -specosm <- readCosmxSPE(cosm_path) - -specosm <- spatialPerCellQC(specosm) - -specosm <- computeQCScore( - specosm, - modelFormula=common_formula, - verbose=TRUE -) - -length(specosm$QC_score) == ncol(specosm) - -sum(is.na(specosm$QC_score)) -summary(specosm$QC_score) - -metadata(specosm)$QCScore_model$model_formula -metadata(specosm)$QCScore_model$model_matrix_colnames -metadata(specosm)$QCScore_model$coefficients_table - -## ----------------------------- -## Save native scores before transfer -## ----------------------------- - -spexen$QC_score_xenium_native <- spexen$QC_score -specosm$QC_score_cosmx_native <- specosm$QC_score - -xen_model <- metadata(spexen)$QCScore_model -cosmx_model <- metadata(specosm)$QCScore_model - -## ----------------------------- -## Apply transferred models -## ----------------------------- - -specosm <- applyQCScoreModel( - spe=specosm, - qcModel=xen_model, - scoreName="QC_score_xenium_model" -) - -spexen <- applyQCScoreModel( - spe=spexen, - qcModel=cosmx_model, - scoreName="QC_score_cosmx_model" -) - - -## ----------------------------- -## Summaries -## ----------------------------- - -message("Xenium native score:") -print(summary(spexen$QC_score_xenium_native)) - -message("Xenium scored with CosMx model:") -print(summary(spexen$QC_score_cosmx_model)) - -message("CosMx native score:") -print(summary(specosm$QC_score_cosmx_native)) - -message("CosMx scored with Xenium model:") -print(summary(specosm$QC_score_xenium_model)) - - -## ----------------------------- -## Correlations -## ----------------------------- - -message("Correlation in Xenium: native vs CosMx model") -print( - cor( - spexen$QC_score_xenium_native, - spexen$QC_score_cosmx_model, - use="complete.obs" - ) -) - -message("Correlation in CosMx: native vs Xenium model") -print( - cor( - specosm$QC_score_cosmx_native, - specosm$QC_score_xenium_model, - use="complete.obs" - ) -) - - -## ----------------------------- -## Model details -## ----------------------------- - -message("Xenium model formula:") -print(metadata(spexen)$QCScore_model$model_formula) - -message("CosMx model formula:") -print(metadata(specosm)$QCScore_model$model_formula) - -message("Xenium model coefficients:") -print(metadata(spexen)$QCScore_model$coefficients_table) - -message("CosMx model coefficients:") -print(metadata(specosm)$QCScore_model$coefficients_table) - - -## ----------------------------- -## Correlations of scores with model variables -## ----------------------------- -cor( - spexen$QC_score_xenium_native, - spexen$QC_score_cosmx_model, - use="complete.obs", - method="spearman" -) - -cor( - specosm$QC_score_cosmx_native, - specosm$QC_score_xenium_model, - use="complete.obs", - method="spearman" -) - -range(spexen$QC_score_cosmx_model, na.rm=TRUE) -sum(is.na(spexen$QC_score_cosmx_model)) - -range(specosm$QC_score_xenium_model, na.rm=TRUE) -sum(is.na(specosm$QC_score_xenium_model)) - -res_xen <- plot_qc_score_comparison( - x=spexen$QC_score_xenium_native, - y=spexen$QC_score_cosmx_model, - x_label="Native Xenium QC score", - y_label="CosMx-trained QC score", - title="Xenium: native vs CosMx-trained QC score", - cor_method = "pearson", - threshold=0.25, - output_file="~/SpaceTrooper_qs_check/xenium_native_vs_cosmx_model.pdf" -) - -res_xen$plot -res_xen$correlation -res_xen$quadrant_table - -res_cosmx <- plot_qc_score_comparison( - x=specosm$QC_score_cosmx_native, - y=specosm$QC_score_xenium_model, - x_label="Native CosMx QC score", - y_label="Xenium-trained QC score", - title="CosMx: native vs Xenium-trained QC score", - cor_method = "pearson", - threshold=0.75, - output_file="~/SpaceTrooper_qs_check/cosmx_native_vs_xenium_model.pdf" -) - -res_cosmx$plot -res_cosmx$correlation -res_cosmx$quadrant_table diff --git a/inst/scripts/transfer_coeffs_xenium_xenium.R b/inst/scripts/transfer_coeffs_xenium_xenium.R deleted file mode 100644 index 7ed919e..0000000 --- a/inst/scripts/transfer_coeffs_xenium_xenium.R +++ /dev/null @@ -1,348 +0,0 @@ -spexen_bc1 <- readXeniumSPE("/Users/inzirio/Downloads/Xenium_data/Xenium_HumanBreast1_Janesick/") -spexen_bc2 <- readXeniumSPE("/Users/inzirio/Downloads/Xenium_data/Xenium_HumanBreast2_Janesick") - - -spexen_bc1 <- spatialPerCellQC(spexen_bc1) -spexen_bc2 <- spatialPerCellQC(spexen_bc2) - -xenium_formula <- paste0( - "~(", - "log2SignalDensity + ", - "Area_um + ", - "log2AspectRatio + ", - "log2Ctrl_total_ratio", - ")^2" -) - -spexen_bc1 <- computeQCScore( - spe=spexen_bc1, - modelFormula=xenium_formula, - verbose=TRUE -) - -spexen_bc2 <- computeQCScore( - spe=spexen_bc2, - modelFormula=xenium_formula, - verbose=TRUE -) - -spexen_bc1$QC_score_bc1_native <- spexen_bc1$QC_score -spexen_bc2$QC_score_bc2_native <- spexen_bc2$QC_score - -bc1_model <- metadata(spexen_bc1)$QCScore_model -bc2_model <- metadata(spexen_bc2)$QCScore_model - -spexen_bc1 <- applyQCScoreModel( - spe=spexen_bc1, - qcModel=bc2_model, - scoreName="QC_score_bc2_model" -) - -spexen_bc2 <- applyQCScoreModel( - spe=spexen_bc2, - qcModel=bc1_model, - scoreName="QC_score_bc1_model" -) - -cor_bc1_spearman <- cor( - spexen_bc1$QC_score_bc1_native, - spexen_bc1$QC_score_bc2_model, - use="complete.obs", - method="spearman" -) - -cor_bc1_pearson <- cor( - spexen_bc1$QC_score_bc1_native, - spexen_bc1$QC_score_bc2_model, - use="complete.obs", - method="pearson" -) - -cor_bc2_spearman <- cor( - spexen_bc2$QC_score_bc2_native, - spexen_bc2$QC_score_bc1_model, - use="complete.obs", - method="spearman" -) - -cor_bc2_pearson <- cor( - spexen_bc2$QC_score_bc2_native, - spexen_bc2$QC_score_bc1_model, - use="complete.obs", - method="pearson" -) - -res_bc1 <- plot_qc_score_comparison( - x=spexen_bc1$QC_score_bc1_native, - y=spexen_bc1$QC_score_bc2_model, - x_label="Breast Cancer 1 native QS", - y_label="Breast Cancer 2-trained QS", - title="Xenium Breast Cancer 1: native QS vs Breast Cancer 2-trained QS", - threshold=0.75, - cor_method="spearman", - output_file=file.path( - out_dir, - "xenium_breast1_native_vs_breast2_model_A4_landscape.pdf" - ) -) - -res_bc2 <- plot_qc_score_comparison( - x=spexen_bc2$QC_score_bc2_native, - y=spexen_bc2$QC_score_bc1_model, - x_label="Breast Cancer 2 native QS", - y_label="Breast Cancer 1-trained QS", - title="Xenium Breast Cancer 2: native QS vs Breast Cancer 1-trained QS", - threshold=0.75, - cor_method="spearman", - output_file=file.path( - out_dir, - "xenium_breast2_native_vs_breast1_model_A4_landscape.pdf" - ) -) - -res_bc1$plot -res_bc2$plot - -color_vars <- c( - "log2SignalDensity", - "Area_um", - "log2AspectRatio", - "log2Ctrl_total_ratio" -) - -bc1_colored_plots <- lapply(color_vars, function(v) { - plot_qc_score_coldata( - x=spexen_bc1$QC_score_bc1_native, - y=spexen_bc1$QC_score_bc2_model, - spe=spexen_bc1, - color_col=v, - x_label="Breast Cancer 1 native QS", - y_label="Breast Cancer 2-trained QS", - title=paste0("Breast Cancer 1 native vs Breast Cancer 2-trained QS: ", v), - threshold=0.75, - cor_method="spearman", - output_file=file.path( - out_dir, - paste0("xenium_breast1_native_vs_breast2_model_", v, ".pdf") - ) - ) -}) -names(bc1_colored_plots) <- color_vars - -bc2_colored_plots <- lapply(color_vars, function(v) { - plot_qc_score_coldata( - x=spexen_bc2$QC_score_bc2_native, - y=spexen_bc2$QC_score_bc1_model, - spe=spexen_bc2, - color_col=v, - x_label="Breast Cancer 2 native QS", - y_label="Breast Cancer 1-trained QS", - title=paste0("Breast Cancer 2 native vs Breast Cancer 1-trained QS: ", v), - threshold=0.75, - cor_method="spearman", - output_file=file.path( - out_dir, - paste0("xenium_breast2_native_vs_breast1_model_", v, ".pdf") - ) - ) -}) -names(bc2_colored_plots) <- color_vars - -xenium_breast_transfer_summary <- data.frame( - comparison=c( - "Breast Cancer 1 native vs Breast Cancer 2-trained", - "Breast Cancer 2 native vs Breast Cancer 1-trained" - ), - spearman=c(cor_bc1_spearman, cor_bc2_spearman), - pearson=c(cor_bc1_pearson, cor_bc2_pearson), - stringsAsFactors=FALSE -) - -xenium_breast_transfer_summary - -write.csv( - xenium_breast_transfer_summary, - file=file.path(out_dir, "xenium_breast1_breast2_transfer_summary.csv"), - row.names=FALSE -) - -metadata(spexen_bc1)$QCScore_model$model_matrix_colnames -metadata(spexen_bc2)$QCScore_model$model_matrix_colnames - -######################## - -dbkx_path <- "~/Downloads/Xenium_data/db_kero_xen" - -spexen <- readXeniumSPE(dbkx_path) -spexen_bc1 <- readXeniumSPE("/Users/inzirio/Downloads/Xenium_data/Xenium_HumanBreast1_Janesick/") - - - -spexen <- spatialPerCellQC(spexen) -spexen_bc1 <- spatialPerCellQC(spexen_bc1) - -xenium_formula <- paste0( - "~(", - "log2SignalDensity + ", - "Area_um + ", - "log2AspectRatio + ", - "log2Ctrl_total_ratio", - ")^2" -) - -spexen <- computeQCScore( - spe=spexen, - modelFormula=xenium_formula, - verbose=TRUE -) - -spexen_bc1 <- computeQCScore( - spe=spexen_bc1, - modelFormula=xenium_formula, - verbose=TRUE -) - -spexen$QC_score_dbk_native <- spexen$QC_score -spexen_bc1$QC_score_bc1_native <- spexen_bc1$QC_score - -dbk_model <- metadata(spexen)$QCScore_model -bc1_model <- metadata(spexen_bc1)$QCScore_model - -spexen <- applyQCScoreModel( - spe=spexen, - qcModel=bc1_model, - scoreName="QC_score_bc1_model" -) - -spexen_bc1 <- applyQCScoreModel( - spe=spexen_bc1, - qcModel=dbk_model, - scoreName="QC_score_dbk_model" -) - -cor_dbk_spearman <- cor( - spexen$QC_score_dbk_native, - spexen$QC_score_bc1_model, - use="complete.obs", - method="spearman" -) - -cor_dbk_pearson <- cor( - spexen$QC_score_dbk_native, - spexen$QC_score_bc1_model, - use="complete.obs", - method="pearson" -) - -cor_bc1_spearman <- cor( - spexen_bc1$QC_score_bc1_native, - spexen_bc1$QC_score_dbk_model, - use="complete.obs", - method="spearman" -) - -cor_bc1_pearson <- cor( - spexen_bc1$QC_score_bc1_native, - spexen_bc1$QC_score_dbk_model, - use="complete.obs", - method="pearson" -) - -res_dbk <- plot_qc_score_comparison( - x=spexen$QC_score_dbk_native, - y=spexen$QC_score_bc1_model, - x_label="DBK Xenium native QS", - y_label="Breast 1-trained QS", - title="Xenium DBK: native QS vs Breast 1-trained QS", - threshold=0.75, - cor_method="spearman", - output_file=file.path( - out_dir, - "xenium_dbk_native_vs_breast1_model_A4_landscape.pdf" - ) -) - -res_bc1 <- plot_qc_score_comparison( - x=spexen_bc1$QC_score_bc1_native, - y=spexen_bc1$QC_score_dbk_model, - x_label="Breast 1 native QS", - y_label="DBK-trained QS", - title="Xenium Breast 1: native QS vs DBK-trained QS", - threshold=0.75, - cor_method="spearman", - output_file=file.path( - out_dir, - "xenium_breast1_native_vs_dbk_model_A4_landscape.pdf" - ) -) - -res_dbk$plot -res_bc1$plot - -color_vars <- c( - "log2SignalDensity", - "Area_um", - "log2AspectRatio", - "log2Ctrl_total_ratio" -) - -dbk_colored_plots <- lapply(color_vars, function(v) { - plot_qc_score_coldata( - x=spexen$QC_score_dbk_native, - y=spexen$QC_score_bc1_model, - spe=spexen, - color_col=v, - x_label="DBK Xenium native QS", - y_label="Breast 1-trained QS", - title=paste0("DBK Xenium native vs Breast 1-trained QS: ", v), - threshold=0.75, - cor_method="spearman", - output_file=file.path( - out_dir, - paste0("xenium_dbk_native_vs_breast1_model_", v, ".pdf") - ) - ) -}) -names(dbk_colored_plots) <- color_vars - -bc1_colored_plots <- lapply(color_vars, function(v) { - plot_qc_score_coldata( - x=spexen_bc1$QC_score_bc1_native, - y=spexen_bc1$QC_score_dbk_model, - spe=spexen_bc1, - color_col=v, - x_label="Breast 1 native QS", - y_label="DBK-trained QS", - title=paste0("Breast 1 native vs DBK-trained QS: ", v), - threshold=0.75, - cor_method="spearman", - output_file=file.path( - out_dir, - paste0("xenium_breast1_native_vs_dbk_model_", v, ".pdf") - ) - ) -}) -names(bc1_colored_plots) <- color_vars - -xenium_dbk_breast1_transfer_summary <- data.frame( - comparison=c( - "DBK Xenium native vs Breast 1-trained", - "Breast 1 native vs DBK-trained" - ), - spearman=c(cor_dbk_spearman, cor_bc1_spearman), - pearson=c(cor_dbk_pearson, cor_bc1_pearson), - stringsAsFactors=FALSE -) - -xenium_dbk_breast1_transfer_summary - -write.csv( - xenium_dbk_breast1_transfer_summary, - file=file.path(out_dir, "xenium_dbk_breast1_transfer_summary.csv"), - row.names=FALSE -) - -metadata(spexen)$QCScore_model$model_matrix_colnames -metadata(spexen_bc1)$QCScore_model$model_matrix_colnames - -######################### diff --git a/inst/scripts/transfer_qs_coefficients.R b/inst/scripts/transfer_qs_coefficients.R deleted file mode 100644 index 27b4c3a..0000000 --- a/inst/scripts/transfer_qs_coefficients.R +++ /dev/null @@ -1,665 +0,0 @@ -library(SpaceTrooper) -library(SummarizedExperiment) -library(S4Vectors) - -# Read a dataset with the appropriate SpaceTrooper reader according to the -# selected technology. -# -# Supported values are: -# - "cosmx" -# - "cosmx_protein" -# - "xenium" -read_dataset_for_transfer <- function( - dir_name, - technology=c("cosmx", "cosmx_protein", "xenium"), - sample_name="sample01" -) { - technology <- match.arg(technology) - - switch( - technology, - cosmx=readCosmxSPE(dir_name, sampleName=sample_name), - cosmx_protein=readCosmxProteinSPE(dir_name, sampleName=sample_name), - xenium=readXeniumSPE( - dir_name, - sampleName=sample_name, - computeMissingMetrics=TRUE, - keepPolygons=FALSE - ) - ) -} - -# Compute the per-cell QC metrics required before training or applying QS. -# -# The same preprocessing is applied to both source and target datasets so that -# transferred coefficients operate on harmonised metric columns. -prepare_dataset_for_transfer <- function( - spe, - rm_zeros=TRUE, - neg_prob_list=c("NegPrb", "Negative", "SystemControl") -) { - spatialPerCellQC( - spe, - rmZeros=rm_zeros, - negProbList=neg_prob_list - ) -} - -# Identify the metric set that can be transferred from source to target. -# -# For heterogeneous transfers, the safest common metrics are: -# - log2SignalDensity -# - Area_um -# - log2Ctrl_total_ratio -# -# The border-dependent aspect-ratio term is included only if both datasets have -# `log2AspectRatio` and `dist_border`, which is typically the case for CosMx to -# CosMx transfers. -get_transferable_metrics <- function(source_spe, target_spe) { - source_cols <- colnames(SummarizedExperiment::colData(source_spe)) - target_cols <- colnames(SummarizedExperiment::colData(target_spe)) - shared_cols <- intersect(source_cols, target_cols) - - metrics <- intersect( - c("log2SignalDensity", "Area_um", "log2Ctrl_total_ratio"), - shared_cols - ) - - if (all(c("log2AspectRatio", "dist_border") %in% source_cols) && - all(c("log2AspectRatio", "dist_border") %in% target_cols)) { - metrics <- c(metrics, "log2AspectRatio") - } - - unique(metrics) -} - -# Extract a readable coefficient table from a fitted glmnet model. -# -# Coefficients are returned at the selected lambda, so they are exactly the ones -# transferred to the target dataset. -extract_qs_coefficients <- function(fit, lambda) { - coef_mat <- as.matrix(stats::predict(fit, s=lambda, type="coefficients")) - data.frame( - term=rownames(coef_mat), - coefficient=as.numeric(coef_mat[, 1]), - row.names=NULL - ) -} - -# Apply a fitted QS model to a new SpatialExperiment object. -# -# The design matrix is rebuilt from the target `colData` using the compatible -# transfer formula trained on the source dataset. -score_target_dataset <- function(spe, fit, lambda, model_formula) { - df <- as.data.frame(SummarizedExperiment::colData(spe)) - x <- stats::model.matrix(stats::as.formula(model_formula), data=df) - spe$QC_score <- as.vector( - stats::predict(fit, s=lambda, newx=x, type="response") - ) - spe -} - -# Compute AUROC with a rank-based formula, avoiding extra package dependencies. -auc_rank <- function(score, label_positive) { - label_positive <- as.integer(label_positive) - n_pos <- sum(label_positive == 1L) - n_neg <- sum(label_positive == 0L) - - if (n_pos == 0L || n_neg == 0L) { - return(NA_real_) - } - - ranks <- rank(score, ties.method="average") - (sum(ranks[label_positive == 1L]) - n_pos * (n_pos + 1) / 2) / - (n_pos * n_neg) -} - -# Build pseudo-reference labels on a dataset using the same metric subset used -# for transfer. -# -# This is useful for evaluating how well transferred coefficients separate -# pseudo-bad from pseudo-good cells on the target dataset. -build_reference_labels_transfer <- function(spe, metric_list, verbose=FALSE) { - spe_ref <- computeOutliersQCScore(spe, metricList=metric_list) - spe_ref <- checkOutliers(spe_ref, verbose=verbose) - - df_ref <- computeTrainDF( - colData=SummarizedExperiment::colData(spe_ref), - formulaVars=S4Vectors::metadata(spe_ref)$formula_variables, - tech=S4Vectors::metadata(spe_ref)$technology, - verbose=verbose - ) - - df_ref[, c("cell_id", "qcscore_train")] -} - -# Safe wrapper for pseudo-reference label generation. -safe_reference_labels_transfer <- function(spe, metric_list, verbose=FALSE) { - tryCatch( - list( - data=build_reference_labels_transfer( - spe, - metric_list=metric_list, - verbose=verbose - ), - error=NULL - ), - error=function(e) list(data=NULL, error=conditionMessage(e)) - ) -} - -# Summarise the separation between pseudo-bad and pseudo-good cells. -# -# The summary is computed on either source or target after scoring with the -# transferred model. -summarise_transfer_eval <- function(df_eval, threshold=0.5) { - if (is.null(df_eval) || nrow(df_eval) == 0L) { - return(data.frame()) - } - - is_bad <- df_eval$qcscore_train == 0 - is_good <- df_eval$qcscore_train == 1 - pred_bad <- df_eval$QC_score < threshold - - data.frame( - n_labeled=nrow(df_eval), - n_bad=sum(is_bad), - n_good=sum(is_good), - median_qs_bad=if (sum(is_bad) > 0) { - stats::median(df_eval$QC_score[is_bad]) - } else { - NA_real_ - }, - median_qs_good=if (sum(is_good) > 0) { - stats::median(df_eval$QC_score[is_good]) - } else { - NA_real_ - }, - auc_bad_vs_good=auc_rank(1 - df_eval$QC_score, as.integer(is_bad)), - sensitivity_bad_qs_lt_threshold=if (sum(is_bad) > 0) { - mean(pred_bad[is_bad]) - } else { - NA_real_ - }, - specificity_good_qs_ge_threshold=if (sum(is_good) > 0) { - mean(!pred_bad[is_good]) - } else { - NA_real_ - } - ) -} - -# Add good/bad/unlabelled labels to a SpatialExperiment object for plotting. -# -# Labels are derived from the evaluation table built for either the source or -# the target dataset after scoring with transferred coefficients. -append_transfer_labels <- function( - spe, - df_eval, - label_col="transfer_label", - colour_col="transfer_label_color" -) { - labels <- rep("unlabelled", ncol(spe)) - - if (!is.null(df_eval) && nrow(df_eval) > 0L) { - idx <- match(spe$cell_id, df_eval$cell_id) - keep <- !is.na(idx) - labels[keep] <- ifelse(df_eval$qcscore_train[idx[keep]] == 0, - "bad", "good") - } - - spe[[label_col]] <- factor(labels, levels=c("bad", "good", "unlabelled")) - palette <- c(bad="#D55E00", good="#009E73", unlabelled="#BDBDBD") - spe[[colour_col]] <- unname(palette[as.character(spe[[label_col]])]) - spe -} - -# Build a ROC data.frame from pseudo-reference labels and transferred QS. -build_transfer_roc_df <- function(df_eval) { - if (is.null(df_eval) || nrow(df_eval) == 0L) { - return(data.frame()) - } - - label_positive <- as.integer(df_eval$qcscore_train == 0) - n_pos <- sum(label_positive == 1L) - n_neg <- sum(label_positive == 0L) - - if (n_pos == 0L || n_neg == 0L) { - return(data.frame()) - } - - score_bad <- 1 - df_eval$QC_score - ord <- order(score_bad, decreasing=TRUE) - score_bad <- score_bad[ord] - label_positive <- label_positive[ord] - - tp <- cumsum(label_positive == 1L) - fp <- cumsum(label_positive == 0L) - change <- c(score_bad[-1] != score_bad[-length(score_bad)], TRUE) - - data.frame( - fpr=c(0, fp[change] / n_neg), - tpr=c(0, tp[change] / n_pos) - ) -} - -# Plot source or target cells coloured as pseudo-good / pseudo-bad after -# scoring with transferred coefficients. -plot_transfer_labelled_cells <- function( - result, - dataset=c("source", "target"), - size=0.05, - alpha=0.8, - scaleBar=TRUE -) { - dataset <- match.arg(dataset) - - spe <- switch(dataset, - source=result$source_spe, - target=result$target_spe - ) - df_eval <- switch(dataset, - source=result$source_eval, - target=result$target_eval - ) - - spe <- append_transfer_labels(spe, df_eval) - - plotCentroids( - spe, - colourBy="transfer_label", - palette="transfer_label_color", - size=size, - alpha=alpha, - scaleBar=scaleBar - ) + - ggplot2::labs( - title=paste0("Transferred QS labels on ", dataset, " dataset"), - colour="Reference label", - fill="Reference label" - ) -} - -# Plot the spatial distribution of transferred QS on source or target. -plot_transfer_qs <- function( - result, - dataset=c("source", "target"), - size=0.05, - alpha=0.8, - scaleBar=TRUE -) { - dataset <- match.arg(dataset) - - spe <- switch(dataset, - source=result$source_spe, - target=result$target_spe - ) - - plotCentroids( - spe, - colourBy="QC_score", - size=size, - alpha=alpha, - scaleBar=scaleBar - ) + - ggplot2::labs(title=paste0("Transferred QC score on ", dataset, " dataset")) -} - -# Plot ROC curves for the source or target transfer evaluation. -plot_transfer_roc <- function(result, dataset=c("source", "target")) { - dataset <- match.arg(dataset) - - df_eval <- switch(dataset, - source=result$source_eval, - target=result$target_eval - ) - roc_df <- build_transfer_roc_df(df_eval) - - if (nrow(roc_df) == 0L) { - stop("No ROC curve available for the selected dataset") - } - - auc_value <- switch(dataset, - source=result$source_summary$auc_bad_vs_good, - target=result$target_summary$auc_bad_vs_good - ) - - ggplot2::ggplot(roc_df, ggplot2::aes(x=fpr, y=tpr)) + - ggplot2::geom_abline(intercept=0, slope=1, linetype=2, - colour="grey60") + - ggplot2::geom_path(linewidth=0.9, colour="#0072B2") + - ggplot2::coord_fixed() + - ggplot2::theme_bw() + - ggplot2::labs( - title=paste0("ROC curve for transferred QS (", dataset, ")"), - subtitle=paste0("AUC = ", round(auc_value, 4)), - x="False positive rate", - y="True positive rate" - ) -} - -# Compare AUC between source and target after transfer. -plot_transfer_auc_summary <- function(result) { - auc_df <- data.frame( - dataset=c("source", "target"), - auc_bad_vs_good=c( - result$source_summary$auc_bad_vs_good, - if (nrow(result$target_summary) == 0L) NA_real_ else result$target_summary$auc_bad_vs_good - ) - ) - - ggplot2::ggplot(auc_df, ggplot2::aes(x=dataset, y=auc_bad_vs_good, fill=dataset)) + - ggplot2::geom_col(width=0.65, alpha=0.85, show.legend=FALSE) + - ggplot2::geom_text(ggplot2::aes(label=round(auc_bad_vs_good, 4)), - vjust=-0.4, na.rm=TRUE) + - ggplot2::theme_bw() + - ggplot2::labs( - title="AUC summary for transferred QS", - x=NULL, - y="AUC" - ) -} - -# Train a native QS model directly on the target dataset, so it can be compared -# against the transferred QS. -# -# By default it uses the full SpaceTrooper native workflow on the target. -# If `use_transfer_metrics = TRUE`, it restricts the native training to the same -# metric subset used by the transferred model. -compute_native_target_qs <- function( - result, - use_transfer_metrics=FALSE, - verbose=TRUE -) { - target_native <- result$target_spe - target_native$QC_score_transfer <- target_native$QC_score - - if (isTRUE(use_transfer_metrics)) { - target_native <- computeOutliersQCScore( - target_native, - metricList=result$transfer_metrics - ) - target_native <- checkOutliers(target_native, verbose=verbose) - - formula_vars <- S4Vectors::metadata(target_native)$formula_variables - model_formula <- getModelFormula(formula_vars, verbose=verbose) - train_df <- computeTrainDF( - colData=SummarizedExperiment::colData(target_native), - formulaVars=formula_vars, - tech=S4Vectors::metadata(target_native)$technology, - verbose=verbose - ) - x_target <- stats::model.matrix(stats::as.formula(model_formula), data=train_df) - best_lambda <- computeLambda(train_df, model_formula) - fit_native <- trainModel(x_target, train_df) - target_native$QC_score_native <- as.vector( - stats::predict( - fit_native, - s=best_lambda, - newx=stats::model.matrix( - stats::as.formula(model_formula), - data=as.data.frame(SummarizedExperiment::colData(target_native)) - ), - type="response" - ) - ) - return(target_native) - } - - target_native <- computeQCScore(target_native, verbose=verbose) - target_native$QC_score_native <- target_native$QC_score - target_native$QC_score <- target_native$QC_score_transfer - target_native -} - -# Compare transferred QS with a native QS trained directly on the target. -# -# The function reports correlations and agreement on low-quality calls. -compare_transferred_vs_native <- function( - result, - threshold=0.5, - low_fraction=0.1, - use_transfer_metrics=FALSE, - verbose=TRUE -) { - stopifnot(length(low_fraction) == 1L, low_fraction > 0, low_fraction < 1) - - target_native <- compute_native_target_qs( - result, - use_transfer_metrics=use_transfer_metrics, - verbose=verbose - ) - - df_compare <- data.frame( - cell_id=target_native$cell_id, - qc_transfer=target_native$QC_score_transfer, - qc_native=target_native$QC_score_native - ) - - transfer_cut <- stats::quantile(df_compare$qc_transfer, probs=low_fraction, na.rm=TRUE) - native_cut <- stats::quantile(df_compare$qc_native, probs=low_fraction, na.rm=TRUE) - - comparison_summary <- data.frame( - pearson=stats::cor(df_compare$qc_transfer, df_compare$qc_native, - method="pearson", use="complete.obs"), - spearman=stats::cor(df_compare$qc_transfer, df_compare$qc_native, - method="spearman", use="complete.obs"), - low_qc_agreement_at_threshold=mean( - (df_compare$qc_transfer < threshold) == (df_compare$qc_native < threshold), - na.rm=TRUE - ), - overlap_lowest_fraction=mean( - (df_compare$qc_transfer <= transfer_cut) & - (df_compare$qc_native <= native_cut), - na.rm=TRUE - ) / low_fraction - ) - - list( - summary=comparison_summary, - df_compare=df_compare, - target_native_spe=target_native, - threshold=threshold, - low_fraction=low_fraction, - use_transfer_metrics=use_transfer_metrics - ) -} - -# Scatter plot of transferred QS vs native QS on the target dataset. -plot_transfer_vs_native_scatter <- function(comparison_result) { - ggplot2::ggplot( - comparison_result$df_compare, - ggplot2::aes(x=qc_native, y=qc_transfer) - ) + - ggplot2::geom_point(alpha=0.4, size=0.6, colour="#0072B2") + - ggplot2::geom_abline(intercept=0, slope=1, linetype=2, - colour="grey60") + - ggplot2::theme_bw() + - ggplot2::labs( - title="Transferred QS vs native QS on target", - subtitle=paste0( - "Spearman = ", - round(comparison_result$summary$spearman, 4), - ", agreement@", - comparison_result$threshold, - " = ", - round(comparison_result$summary$low_qc_agreement_at_threshold, 4) - ), - x="Native target QS", - y="Transferred QS" - ) -} - -# Train a transferable QS model on one dataset and apply it to another. -# -# Workflow: -# 1. read source and target datasets -# 2. compute QC metrics on both -# 3. identify the metric subset that exists in both datasets -# 4. train the ridge-logistic QS model on the source dataset using only those metrics -# 5. transfer the fitted coefficients to the target dataset -# 6. optionally evaluate the transfer on pseudo-reference labels in the target -transfer_qs_coefficients <- function( - source_dir, - target_dir, - source_technology=c("cosmx", "cosmx_protein", "xenium"), - target_technology=c("cosmx", "cosmx_protein", "xenium"), - source_sample_name="source_sample", - target_sample_name="target_sample", - threshold=0.5, - evaluate_target=TRUE, - verbose=TRUE -) { - source_technology <- match.arg(source_technology) - target_technology <- match.arg(target_technology) - - # Read the source dataset, from which coefficients will be learned. - source_spe <- read_dataset_for_transfer( - dir_name=source_dir, - technology=source_technology, - sample_name=source_sample_name - ) - - # Read the target dataset, on which coefficients will be transferred. - target_spe <- read_dataset_for_transfer( - dir_name=target_dir, - technology=target_technology, - sample_name=target_sample_name - ) - - # Harmonise per-cell QC metrics in both datasets. - source_spe <- prepare_dataset_for_transfer(source_spe) - target_spe <- prepare_dataset_for_transfer(target_spe) - - # Keep only the subset of QS metrics that is available in both datasets. - transfer_metrics <- get_transferable_metrics(source_spe, target_spe) - - if (!"log2SignalDensity" %in% transfer_metrics) { - stop("log2SignalDensity is required for transferable QS training") - } - if (length(transfer_metrics) < 2L) { - stop("At least two shared metrics are recommended for coefficient transfer") - } - - if (verbose) { - message("Transferable metrics: ", paste(transfer_metrics, collapse=", ")) - } - - # Compute source outliers only on the transferable metric subset. - source_spe <- computeOutliersQCScore(source_spe, metricList=transfer_metrics) - source_spe <- checkOutliers(source_spe, verbose=verbose) - - formula_vars <- S4Vectors::metadata(source_spe)$formula_variables - if (!"log2SignalDensity" %in% names(formula_vars)) { - stop("The transferable source formula lost log2SignalDensity after outlier filtering") - } - - # Build the compatible model formula and the balanced source training table. - model_formula <- getModelFormula(formula_vars, verbose=verbose) - source_train_df <- computeTrainDF( - colData=SummarizedExperiment::colData(source_spe), - formulaVars=formula_vars, - tech=S4Vectors::metadata(source_spe)$technology, - verbose=verbose - ) - - # Fit the ridge-logistic QS model on the source dataset. - x_source <- stats::model.matrix(stats::as.formula(model_formula), - data=source_train_df) - best_lambda <- computeLambda(source_train_df, model_formula) - fit <- trainModel(x_source, source_train_df) - coefficient_table <- extract_qs_coefficients(fit, best_lambda) - - # Score the full source and target datasets with the same transferred model. - source_spe <- score_target_dataset(source_spe, fit, best_lambda, model_formula) - target_spe <- score_target_dataset(target_spe, fit, best_lambda, model_formula) - - # Build the labelled source evaluation set from the source training examples. - source_eval <- merge( - data.frame(cell_id=source_spe$cell_id, QC_score=source_spe$QC_score), - source_train_df[, c("cell_id", "qcscore_train")], - by="cell_id" - ) - source_summary <- summarise_transfer_eval(source_eval, threshold=threshold) - - target_eval <- NULL - target_summary <- data.frame() - target_label_error <- NA_character_ - - if (isTRUE(evaluate_target)) { - # Build pseudo-reference labels independently on the target dataset, - # using the same transferable metric subset. - target_ref <- safe_reference_labels_transfer( - target_spe, - metric_list=transfer_metrics, - verbose=verbose - ) - target_label_error <- target_ref$error - - if (!is.null(target_ref$data)) { - target_eval <- merge( - data.frame(cell_id=target_spe$cell_id, QC_score=target_spe$QC_score), - target_ref$data, - by="cell_id" - ) - target_summary <- summarise_transfer_eval(target_eval, - threshold=threshold) - } - } - - list( - source_technology=source_technology, - target_technology=target_technology, - transfer_metrics=transfer_metrics, - model_formula=model_formula, - lambda=best_lambda, - coefficient_table=coefficient_table, - source_summary=source_summary, - target_summary=target_summary, - target_label_error=target_label_error, - source_train_df=source_train_df, - source_eval=source_eval, - target_eval=target_eval, - source_spe=source_spe, - target_spe=target_spe, - fit=fit - ) -} - -# Esempi di utilizzo: -# source("inst/scripts/transfer_qs_coefficients.R") -# -# Train su CosMx e applica a Xenium -# res_transfer_cx <- transfer_qs_coefficients( -# source_dir="/path/to/cosmx_dataset", -# source_technology="cosmx", -# target_dir="/path/to/xenium_dataset", -# target_technology="xenium", -# source_sample_name="CosMx_source", -# target_sample_name="Xenium_target" -# ) -# res_transfer_cx$coefficient_table -# res_transfer_cx$target_summary -# plot_transfer_labelled_cells(res_transfer_cx, dataset="target", size=0.08) -# plot_transfer_qs(res_transfer_cx, dataset="target", size=0.08) -# plot_transfer_roc(res_transfer_cx, dataset="target") -# plot_transfer_auc_summary(res_transfer_cx) -# -# cmp_transfer_cx <- compare_transferred_vs_native( -# res_transfer_cx, -# threshold=0.5, -# low_fraction=0.1 -# ) -# cmp_transfer_cx$summary -# plot_transfer_vs_native_scatter(cmp_transfer_cx) -# -# Train su Xenium e applica a CosMx -# res_transfer_xc <- transfer_qs_coefficients( -# source_dir="/path/to/xenium_dataset", -# source_technology="xenium", -# target_dir="/path/to/cosmx_dataset", -# target_technology="cosmx" -# ) -# res_transfer_xc$coefficient_table -# res_transfer_xc$target_summary -# res_transfer_xc$target_summary From c85f7412439a6124be4173a7bfaa1e0c3ea3c7d7 Mon Sep 17 00:00:00 2001 From: Dario Date: Wed, 10 Jun 2026 16:51:27 +0200 Subject: [PATCH 10/10] improving documentation for QC --- R/QC.R | 17 ++++++++++++++++- man/applyQCScoreModel.Rd | 13 +++++++++++++ man/computeQCScore.Rd | 5 ++++- 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/R/QC.R b/R/QC.R index 78f5798..a265b95 100644 --- a/R/QC.R +++ b/R/QC.R @@ -413,7 +413,7 @@ computeLambda <- function(trainDF, modelFormula) { #' For each couple of variables interaction terms are computed. #' #' Additionally, for CosMx datasets, the distance from the border of the FOV is -#' also included in the formula as a metric to take into account . +#' also included in the formula as a metric to take into account. #' For Xenium and Merscope datasets, QC score cannot depend on FoV border effect, #' as no FOV border effect was captured through this metric. #' @@ -438,6 +438,9 @@ computeLambda <- function(trainDF, modelFormula) { #' Otherwise, an easier way is to let be lambda computed internally, just set #' a seed with `set.seed()` before running `computeQCScore`. #' +#' The computed model output is stored in `metadata(spe)$QCScore_model` +#' for inspection and reuse (see also \code{\link{applyQCScoreModel}}). +#' #' @param spe A `SpatialExperiment` object with spatial transcriptomics data. #' @param verbose logical for having a verbose output. Default is FALSE. #' @param bestLambda the best lambda typically computed using `computeLambda`. @@ -1207,6 +1210,18 @@ checkOutliers <- function(spe, verbose=FALSE) { #' #' @description #' Apply a previously trained QC score model to a new SpatialExperiment object. +#' See details for important considerations when applying a model to a +#' different dataset. +#' @details +#' The authors do not reccommend applying a QC score model trained on one +#' dataset to a different dataset, but this could be useful in some cases, +#' for example to transfer a model trained on a dataset core and the applied to +#' other cores from the same experiment. +#' The QC score model should have been trained on a dataset with the same +#' set of QC metrics and similar data distribution to have better +#' predictions. The function will check for the presence of required model +#' variables in the new dataset and will handle missing or extra variables +#' accordingly. #' #' @param spe A `SpatialExperiment` object with QC metrics already computed. #' @param qcModel A QC score model object, usually stored in diff --git a/man/applyQCScoreModel.Rd b/man/applyQCScoreModel.Rd index 9bea5d2..3e67195 100644 --- a/man/applyQCScoreModel.Rd +++ b/man/applyQCScoreModel.Rd @@ -19,6 +19,19 @@ A `SpatialExperiment` object with added QC score in `colData`. } \description{ Apply a previously trained QC score model to a new SpatialExperiment object. +See details for important considerations when applying a model to a +different dataset. +} +\details{ +The authors do not reccommend applying a QC score model trained on one +dataset to a different dataset, but this could be useful in some cases, +for example to transfer a model trained on a dataset core and the applied to +other cores from the same experiment. +The QC score model should have been trained on a dataset with the same +set of QC metrics and similar data distribution to have better +predictions. The function will check for the presence of required model +variables in the new dataset and will handle missing or extra variables +accordingly. } \examples{ diff --git a/man/computeQCScore.Rd b/man/computeQCScore.Rd index 0af8503..edc0af6 100644 --- a/man/computeQCScore.Rd +++ b/man/computeQCScore.Rd @@ -49,7 +49,7 @@ size is the area for CosMx and Xenium, while it is the volume for Merfish. For each couple of variables interaction terms are computed. Additionally, for CosMx datasets, the distance from the border of the FOV is -also included in the formula as a metric to take into account . +also included in the formula as a metric to take into account. For Xenium and Merscope datasets, QC score cannot depend on FoV border effect, as no FOV border effect was captured through this metric. @@ -73,6 +73,9 @@ previously computed with `computeLambda` preceeded by `computeTrainDF` and This is useful for reproducibility across different runs. Otherwise, an easier way is to let be lambda computed internally, just set a seed with `set.seed()` before running `computeQCScore`. + +The computed model output is stored in `metadata(spe)$QCScore_model` +for inspection and reuse (see also \code{\link{applyQCScoreModel}}). } \examples{ example(spatialPerCellQC)