diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8526721..916c2ef 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+v0.20.3
+
+- Add chromsizes tileset_info function
+
 v0.20.2
 
 - Convert cooler chromsizes to int64 to prevent overflow error with recent versions of h5py and numpy
diff --git a/clodius/models/tileset_info.py b/clodius/models/tileset_info.py
new file mode 100644
index 0000000..6e63ba8
--- /dev/null
+++ b/clodius/models/tileset_info.py
@@ -0,0 +1,13 @@
+from typing import List, Optional
+
+from pydantic import BaseModel
+
+
+class TilesetInfo(BaseModel):
+    """Fields of the tileset info object returned by tileset_info functions."""
+
+    max_width: int
+    min_pos: List[int]
+    max_pos: List[int]
+    # Defaults to None so the field may be omitted under pydantic v1 and v2.
+    chromsizes: Optional[List] = None
diff --git a/clodius/tiles/chromsizes.py b/clodius/tiles/chromsizes.py
index 993d64c..d330782 100644
--- a/clodius/tiles/chromsizes.py
+++ b/clodius/tiles/chromsizes.py
@@ -1,35 +1,78 @@
-import csv
 import logging
+from typing import IO, Union
 
+from smart_open import open
+
 logger = logging.getLogger(__name__)
 
 
-def get_tsv_chromsizes(filename):
+def tileset_info(filename: Union[str, IO]) -> dict:
+    """Return a standard higlass tileset info object that contains
+    chromsizes as an element.
+
+    `filename` is handed to get_tsv_chromsizes unchanged, so it may be
+    a filename or an open file-like object.
+
+    The chromsizes in the returned object will be a list of [name, size]
+    lists.
+
+    [
+        ['chr1', 1000],
+        ['chr2', 2000]
+    ]
+    """
+    # Convert each size once and reuse it for both the list and the total.
+    chromsizes = [[c[0], int(c[1])] for c in get_tsv_chromsizes(filename)]
+
+    max_width = sum(size for _, size in chromsizes)
+    return {
+        "max_width": max_width,
+        "chromsizes": chromsizes,
+        "min_pos": [0],
+        "max_pos": [max_width],
+    }
+
+
+def get_tsv_chromsizes(file):
     """
     Get a list of chromosome sizes from this [presumably] tsv
-    chromsizes file file.
+    chromsizes file.
 
     Parameters:
     -----------
-    filename: string
-        The filename of the tsv file
+    file: string or file-like object
+        A filename, or an open file-like object. A file-like object is
+        rewound and read in full; it is left open for the caller.
 
     Returns
     -------
-    chromsizes: [(name:string, size:int), ...]
-        An ordered list of chromosome names and sizes
+    chromsizes: [[name:string, size:string], ...]
+        An ordered list of chromosome names and sizes. Blank lines are
+        skipped and the sizes are returned as unconverted strings.
     """
     try:
-        with open(filename, "r") as f:
-            reader = csv.reader(f, delimiter="\t")
+        if isinstance(file, str):
+            # A handle opened here is closed here, even if reading fails.
+            with open(file, "rb") as f:
+                raw_data = f.read()
+        else:
+            file.seek(0)
+            raw_data = file.read()
 
-            data = []
-            for row in reader:
-                data.append(row)
+        # Text-mode handles already yield str; only bytes need decoding.
+        if isinstance(raw_data, bytes):
+            raw_data = raw_data.decode("utf-8")
+
+        lines = raw_data.split("\n")
+        data = [line.strip().split("\t") for line in lines if line.strip()]
         return data
     except Exception as ex:
         logger.error(ex)
 
-        err_msg = "WHAT?! Could not load file %s. 😤 (%s)" % (filename, ex)
+        # `file` may be a handle, so report its name when it has one.
+        err_msg = "WHAT?! Could not load file %s. (%s)" % (
+            getattr(file, "name", file),
+            ex,
+        )
 
-        raise Exception(err_msg)
+        raise Exception(err_msg) from ex
diff --git a/get_test_data.sh b/get_test_data.sh
index 8ce5412..246f6b8 100755
--- a/get_test_data.sh
+++ b/get_test_data.sh
@@ -1,3 +1,4 @@
+wget -q -NP data/ https://s3.amazonaws.com/pkerp/public/chromSizes.tsv
 wget -q -NP data/ https://s3.amazonaws.com/pkerp/public/all.KL.bed.multires.mv5
 wget -q -NP data/ https://s3.amazonaws.com/pkerp/public/Dixon2012-J1-NcoI-R1-filtered.100kb.multires.cool
 wget -q -NP data/ https://s3.amazonaws.com/pkerp/public/hic-resolutions.cool
diff --git a/pyproject.toml b/pyproject.toml
index e2f8966..92a78ea 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,6 +25,7 @@ dependencies = [
     "slugid",
     "sortedcontainers",
     "tqdm",
+    "smart_open",
 ]
 license = { text = "MIT" }
 readme = "README.md"
diff --git a/test/tiles/chromsizes_test.py b/test/tiles/chromsizes_test.py
new file mode 100644
index 0000000..5fa5d76
--- /dev/null
+++ b/test/tiles/chromsizes_test.py
@@ -0,0 +1,21 @@
+import os.path as op
+
+import clodius.tiles.chromsizes as ctcs
+from clodius.models.tileset_info import TilesetInfo
+
+
+def test_get_tileset_info():
+    filename = op.join("data", "chromSizes.tsv")
+
+    # Test loading tileset info using a filename
+    tsinfo = TilesetInfo(**ctcs.tileset_info(filename))
+
+    assert tsinfo.max_width > 100
+    assert len(tsinfo.chromsizes) > 2
+
+    with open(filename, "rb") as f:
+        # Test loading using a file-like object
+        tsinfo = TilesetInfo(**ctcs.tileset_info(f))
+
+    assert tsinfo.max_width > 100
+    assert len(tsinfo.chromsizes) > 2