Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cnvlib/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -2798,6 +2798,7 @@ def _cmd_import_rna(args: argparse.Namespace) -> None:
args.do_gc,
args.do_txlen,
args.max_log2,
diploid_parx_genome=args.diploid_parx_genome,
min_sample_fraction=args.min_sample_fraction,
)
logging.info("Writing output files")
Expand Down Expand Up @@ -2895,6 +2896,7 @@ def _cmd_import_rna(args: argparse.Namespace) -> None:
help="Skip transcript length correction.",
)

add_diploid_parx_genome(P_import_rna)
P_import_rna.set_defaults(func=_cmd_import_rna)


Expand Down
6 changes: 5 additions & 1 deletion cnvlib/rna.py
Original file line number Diff line number Diff line change
Expand Up @@ -560,7 +560,11 @@ def correct_cnr(cnr, do_gc, do_txlen, max_log2, diploid_parx_genome):
cnr = center_by_window(cnr, 0.1, cnr["gc"])
if do_txlen and "tx_length" in cnr:
cnr = center_by_window(cnr, 0.1, cnr["tx_length"])
cnr.center_all()
# Re-center after bias correction. Carry diploid_parx_genome through:
# GC/transcript-length window correction is invariant to a uniform
# pre-shift, so without it the earlier center_all(diploid_parx_genome)
# is silently undone and the flag becomes a no-op (cnvkit-d68).
cnr.center_all(diploid_parx_genome=diploid_parx_genome)
if max_log2:
cnr[cnr["log2"] > max_log2, "log2"] = max_log2
return cnr
17 changes: 17 additions & 0 deletions doc/rna.rst
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,23 @@ Values below roughly 0.2 admit genes whose expression is detected in only a smal
minority of samples, so their log2 ratios are likely dominated by noise.


Pseudo-autosomal regions on chromosome X
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The pseudo-autosomal regions (PAR1 and PAR2) on chromosome X are diploid in both
sexes, so genes there behave like autosomal genes rather than sex-linked ones.
The ``--diploid-parx-genome`` option names a reference genome (e.g. ``grch38``)
whose PAR coordinates are then treated as autosomal when ``import-rna`` re-centers
each sample's log2 ratios::

cnvkit.py import-rna *.txt --gene-resource data/ensembl-gene-info.hg38.tsv \
--diploid-parx-genome grch38 --output-dir out/

Including the PAR-X bins in the autosomal pool shifts the centering value and so
changes the log2 output for the affected chromosome-X genes. When the option is
omitted (the default), output is unchanged.


Segmentation
------------

Expand Down
169 changes: 154 additions & 15 deletions test/test_rna.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import pandas as pd

from cnvlib import commands, import_rna, rna
from cnvlib.cnary import CopyNumArray

logging.basicConfig(level=logging.ERROR, format="%(message)s")

Expand Down Expand Up @@ -361,28 +362,34 @@ def test_single_sample_uses_that_sample(self):
self.assertEqual(list(rna.filter_probes(counts).index), ["b", "c"])


def _ast_calls_to(source_obj, attr, value_id):
"""Collect ``value_id.attr(...)`` call nodes in ``source_obj``'s AST.

Shared by the kwarg-plumbing guards below, which assert a specific keyword
survives a CLI -> dispatcher -> leaf call chain without paying any fixture
cost.
"""
tree = ast.parse(inspect.getsource(source_obj))
return [
node
for node in ast.walk(tree)
if isinstance(node, ast.Call)
and isinstance(node.func, ast.Attribute)
and node.func.attr == attr
and isinstance(node.func.value, ast.Name)
and node.func.value.id == value_id
]


class FilterProbesPlumbingTests(unittest.TestCase):
"""``min_sample_fraction`` is threaded CLI -> do_import_rna -> filter_probes.

AST-level guards (cheap, no fixtures) so a dropped kwarg fails at collection
rather than silently reverting single-cell cohorts to the 0.5 default.
"""

@staticmethod
def _calls_to(source_obj, attr, value_id):
tree = ast.parse(inspect.getsource(source_obj))
return [
node
for node in ast.walk(tree)
if isinstance(node, ast.Call)
and isinstance(node.func, ast.Attribute)
and node.func.attr == attr
and isinstance(node.func.value, ast.Name)
and node.func.value.id == value_id
]

def test_do_import_rna_passes_fraction_to_filter_probes(self):
calls = self._calls_to(import_rna.do_import_rna, "filter_probes", "rna")
calls = _ast_calls_to(import_rna.do_import_rna, "filter_probes", "rna")
self.assertGreater(len(calls), 0)
for call in calls:
self.assertIn(
Expand All @@ -392,7 +399,7 @@ def test_do_import_rna_passes_fraction_to_filter_probes(self):
)

def test_cmd_import_rna_passes_fraction_to_do_import_rna(self):
calls = self._calls_to(commands._cmd_import_rna, "do_import_rna", "import_rna")
calls = _ast_calls_to(commands._cmd_import_rna, "do_import_rna", "import_rna")
self.assertGreater(len(calls), 0)
for call in calls:
self.assertIn(
Expand All @@ -418,6 +425,138 @@ def test_signatures_default_to_one_half(self):
)


class DiploidParxPlumbingTests(unittest.TestCase):
"""``diploid_parx_genome`` is threaded CLI -> do_import_rna -> correct_cnr.

The CLI layer was the missing link (cnvkit-d68): ``_cmd_import_rna`` never
forwarded ``args.diploid_parx_genome``, so PAR-aware centering was
unreachable from ``cnvkit import-rna``. AST guard so a dropped kwarg fails
at collection rather than silently disabling the flag again.
"""

def test_cmd_import_rna_forwards_parx_to_do_import_rna(self):
calls = _ast_calls_to(commands._cmd_import_rna, "do_import_rna", "import_rna")
self.assertGreater(len(calls), 0)
for call in calls:
self.assertIn(
"diploid_parx_genome",
{kw.arg for kw in call.keywords},
"_cmd_import_rna must forward args.diploid_parx_genome to "
"do_import_rna",
)

def test_cli_registers_diploid_parx_genome_flag(self):
"""The CLI parser exposes --diploid-parx-genome for import-rna."""
opt_strings = {
opt
for action in commands.P_import_rna._actions
for opt in action.option_strings
}
self.assertIn("--diploid-parx-genome", opt_strings)

def test_do_import_rna_signature_parx_default_none(self):
self.assertIsNone(
inspect.signature(import_rna.do_import_rna)
.parameters["diploid_parx_genome"]
.default,
)


def _make_rna_cnr(seed, *, include_par):
"""Build an hg38-style CopyNumArray for PAR-centering tests.

Autosomes (chr1) at log2 ~ 0, non-PAR chrX bins at log2 ~ -1, and -- when
``include_par`` -- chrX bins inside grch38 PAR1X (10000-2781479) at a
distinctly higher log2 ~ +1, so reclassifying them as autosomal visibly
moves the centering value. ``gc`` is a fraction in [0, 1] to match the real
``import-rna`` pipeline. Calling with the same ``seed`` yields identical
data, which ``correct_cnr`` mutates in place.
"""
rng = np.random.default_rng(seed)
rows = []
for i in range(20):
s = 100000 * (i + 1)
rows.append(("1", s, s + 1500, f"A{i}", rng.normal(0, 0.05), 100, 0.5, 1500))
if include_par:
for i in range(8):
s = 20000 + 100000 * i
rows.append(
(
"X",
s,
s + 1500,
f"PARX{i}",
1.0 + rng.normal(0, 0.05),
100,
0.5,
1500,
)
)
for i in range(8): # chrX outside grch38 PAR1/PAR2
s = 5_000_000 + 100000 * i
rows.append(
("X", s, s + 1500, f"X{i}", -1.0 + rng.normal(0, 0.05), 100, 0.5, 1500)
)
df = pd.DataFrame(
rows,
columns=[
"chromosome",
"start",
"end",
"gene",
"log2",
"depth",
"gc",
"tx_length",
],
)
return CopyNumArray(df, {"sample_id": "s"})


class DiploidParxCenteringTests(unittest.TestCase):
"""PAR-aware centering must actually shift output in the default path.

Regression guard for cnvkit-d68: ``correct_cnr`` applies GC/transcript-
length bias correction (on by default), which is invariant to a uniform
pre-shift. So unless the *second* ``center_all`` also carries
``diploid_parx_genome``, the flag is silently neutralized whenever GC/txlen
correction runs. Build an hg38-style cohort with PAR-X bins and assert the
flag changes chrX/PAR log2 -- while confirming it touches only PAR bins, so
the ``None`` default (``center_all(None)`` == the prior code path) is
unchanged by construction.
"""

def test_flag_shifts_par_centering_under_default_corrections(self):
"""With GC/txlen correction on (the default), grch38 != None output."""
without = rna.correct_cnr(
_make_rna_cnr(0, include_par=True), True, True, 3, None
)
with_parx = rna.correct_cnr(
_make_rna_cnr(0, include_par=True), True, True, 3, "grch38"
)
max_diff = np.abs(
without["log2"].to_numpy() - with_parx["log2"].to_numpy()
).max()
self.assertGreater(
max_diff,
1e-3,
"diploid_parx_genome must change centering even when GC/txlen "
"correction runs (second center_all must carry the flag)",
)

def test_none_path_unaffected_when_no_par_bins(self):
"""No PAR-X bins -> the flag is inert, confirming it touches only PAR."""
none_out = rna.correct_cnr(
_make_rna_cnr(1, include_par=False), True, True, 3, None
)
parx_out = rna.correct_cnr(
_make_rna_cnr(1, include_par=False), True, True, 3, "grch38"
)
np.testing.assert_array_equal(
none_out["log2"].to_numpy(), parx_out["log2"].to_numpy()
)


class ImportRnaIntegrationTests(unittest.TestCase):
"""End-to-end `import-rna` from count files + gene resource to .cnr."""

Expand Down