Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
v0.20.3

- Add chromsizes tileset_info function

v0.20.2

- Convert cooler chromsizes to int64 to prevent overflow error with recent versions of h5py and numpy
Expand Down
10 changes: 10 additions & 0 deletions clodius/models/tileset_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from typing import List, Optional

from pydantic import BaseModel


class TilesetInfo(BaseModel):
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see this is only used in the test. Should that be in the test module only or is this intended as a part of the public API?

max_width: int
min_pos: List[int]
max_pos: List[int]
chromsizes: Optional[List]
48 changes: 37 additions & 11 deletions clodius/tiles/chromsizes.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,61 @@
import csv
import logging
from smart_open import open

logger = logging.getLogger(__name__)


def get_tsv_chromsizes(filename):
def tileset_info(filename: str) -> dict:
    """Build a higlass tileset info dictionary from a chromsizes file.

    ``filename`` is handed unchanged to ``get_tsv_chromsizes``, which
    reads the tab-separated rows; the second column of every row is
    converted to ``int`` here.

    The returned dictionary has four keys:

    - ``max_width``: the sum of all chromosome sizes
    - ``chromsizes``: a list of ``[name, size]`` pairs, e.g.
      ``[['chr1', 1000], ['chr2', 2000]]``
    - ``min_pos``: ``[0]``
    - ``max_pos``: ``[max_width]``
    """
    rows = get_tsv_chromsizes(filename)

    # Convert each size exactly once and reuse the result for the total.
    named_sizes = [[row[0], int(row[1])] for row in rows]
    total_length = sum(size for _, size in named_sizes)

    return {
        "max_width": total_length,
        "chromsizes": named_sizes,
        "min_pos": [0],
        "max_pos": [total_length],
    }


def get_tsv_chromsizes(file):
    """
    Get a list of chromosome names and sizes from a [presumably] tsv
    chromsizes file.

    Parameters:
    -----------
    file: string or file-like object
        Either the name of the tsv file, or an already opened file-like
        object. A file-like object is rewound with ``seek(0)`` and read
        in full; it is left open for the caller to close.

    Returns
    -------
    chromsizes: [[name:string, size:string, ...], ...]
        An ordered list with one entry per non-blank line, holding that
        line's tab-separated fields. Values are NOT converted, so sizes
        are returned as strings.

    Raises
    ------
    Exception
        If the content cannot be read or decoded. The original error is
        attached as the cause. A failure to open a file name is raised
        as-is by ``open`` and is not wrapped.
    """
    opened_here = isinstance(file, str)
    # Remember something printable for the error message before ``file``
    # is rebound to the opened handle.
    source_name = file if opened_here else getattr(file, "name", repr(file))

    if opened_here:
        file = open(file, "rb")

    try:
        file.seek(0)
        raw = file.read()
        # Binary streams yield bytes; accept text-mode streams as well.
        if isinstance(raw, (bytes, bytearray)):
            text = raw.decode("utf-8")
        else:
            text = raw

        # strip() also drops a trailing "\r", so CRLF files parse cleanly.
        return [
            line.strip().split("\t")
            for line in text.split("\n")
            if line.strip()
        ]
    except Exception as ex:
        logger.error(ex)

        err_msg = "WHAT?! Could not load file %s. (%s)" % (source_name, ex)

        raise Exception(err_msg) from ex
    finally:
        # Only close handles created in this function.
        if opened_here:
            file.close()
1 change: 1 addition & 0 deletions get_test_data.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
wget -q -NP data/ https://s3.amazonaws.com/pkerp/public/chromSizes.tsv
wget -q -NP data/ https://s3.amazonaws.com/pkerp/public/all.KL.bed.multires.mv5
wget -q -NP data/ https://s3.amazonaws.com/pkerp/public/Dixon2012-J1-NcoI-R1-filtered.100kb.multires.cool
wget -q -NP data/ https://s3.amazonaws.com/pkerp/public/hic-resolutions.cool
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ dependencies = [
"slugid",
"sortedcontainers",
"tqdm",
"smart_open"
]
license = { text = "MIT" }
readme = "README.md"
Expand Down
21 changes: 21 additions & 0 deletions test/tiles/chromsizes_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import os.path as op

import clodius.tiles.chromsizes as ctcs
from clodius.models.tileset_info import TilesetInfo


def test_get_tileset_info():
    """Check ``tileset_info`` with a file name and with an open binary file.

    Reads ``data/chromSizes.tsv`` relative to the working directory and
    validates each result by loading it into the ``TilesetInfo`` model.
    """
    chromsizes_path = op.join("data", "chromSizes.tsv")

    # Case 1: the source is given as a filename.
    info_from_path = TilesetInfo(**ctcs.tileset_info(chromsizes_path))

    assert info_from_path.max_width > 100
    assert len(info_from_path.chromsizes) > 2

    # Case 2: the source is given as a file-like object.
    with open(chromsizes_path, "rb") as handle:
        info_from_handle = TilesetInfo(**ctcs.tileset_info(handle))

        assert info_from_handle.max_width > 100
        assert len(info_from_handle.chromsizes) > 2