Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
v0.20.3

- Add chromsizes tileset_info function

v0.20.2

- Convert cooler chromsizes to int64 to prevent overflow error with recent versions of h5py and numpy
Expand Down
10 changes: 10 additions & 0 deletions clodius/models/tileset_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from typing import List, Optional

from pydantic import BaseModel


class TilesetInfo(BaseModel):
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see this is only used in the test. Should that be in the test module only or is this intended as a part of the public API?

max_width: int
min_pos: List[int]
max_pos: List[int]
chromsizes: Optional[List]
48 changes: 37 additions & 11 deletions clodius/tiles/chromsizes.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,61 @@
import csv
import logging
from typing import IO, Union

from smart_open import open

logger = logging.getLogger(__name__)


def get_tsv_chromsizes(filename):
def tileset_info(filename: Union[str, IO[bytes]]) -> dict:
    """Return a standard higlass tileset info object that contains
    chromsizes as an element.

    The chromsizes in the returned object will be a list of [name, size]
    pairs, with every size converted to int:

    [
        ['chr1', 1000],
        ['chr2', 2000]
    ]

    Parameters
    ----------
    filename: string or binary file-like object
        Passed straight through to ``get_tsv_chromsizes``, so it may be
        either a path string or an already open binary file object.

    Returns
    -------
    tileset_info: dict
        A dict with the keys ``max_width`` (the sum of all chromosome
        sizes), ``chromsizes`` (the [name, size] pairs), ``min_pos``
        (always ``[0]``) and ``max_pos`` (``[max_width]``).

    Raises
    ------
    ValueError
        If the second column of a row cannot be converted to int.
    IndexError
        If a row has fewer than two tab-separated columns.
    """
    # Convert each size exactly once so the total and the per-chromosome
    # entries are derived from the same parsed values.
    chromsizes = [[row[0], int(row[1])] for row in get_tsv_chromsizes(filename)]

    max_width = sum(size for _, size in chromsizes)
    return {
        "max_width": max_width,
        "chromsizes": chromsizes,
        "min_pos": [0],
        "max_pos": [max_width],
    }


def get_tsv_chromsizes(file):
    """
    Get a list of chromosome sizes from this [presumably] tsv
    chromsizes file.

    Parameters
    ----------
    file: string or file-like object
        Either a string, which is opened in binary mode with this
        module's ``open`` and closed again before returning, or an
        already open file-like object, which is rewound with
        ``seek(0)``, read to the end and left open for the caller.

    Returns
    -------
    chromsizes: [[name:string, size:string], ...]
        An ordered list with one entry per non-blank line; each entry is
        the list of that line's tab-separated fields. All fields are
        returned as strings (sizes are NOT converted to int here).

    Raises
    ------
    Exception
        If the file cannot be opened, read or decoded as UTF-8. The
        original error is logged and attached as ``__cause__``.
    """
    opened_here = isinstance(file, str)

    try:
        handle = open(file, "rb") if opened_here else file
        try:
            if not opened_here:
                # A caller-supplied handle may already have been read
                # from; a freshly opened one is at position 0 already.
                handle.seek(0)
            raw = handle.read()
        finally:
            # Close only what this function opened; a handle passed in
            # by the caller stays under the caller's control.
            if opened_here:
                handle.close()

        # Text-mode handles already yield str, so only bytes need decoding.
        text = raw.decode("utf-8") if isinstance(raw, bytes) else raw

        # strip() also removes a trailing "\r", so CRLF files parse cleanly.
        return [
            line.strip().split("\t") for line in text.split("\n") if line.strip()
        ]
    except Exception as ex:
        logger.error(ex)

        err_msg = "WHAT?! Could not load file %s. (%s)" % (file, ex)

        raise Exception(err_msg) from ex
1 change: 1 addition & 0 deletions get_test_data.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
wget -q -NP data/ https://s3.amazonaws.com/pkerp/public/chromSizes.tsv
wget -q -NP data/ https://s3.amazonaws.com/pkerp/public/all.KL.bed.multires.mv5
wget -q -NP data/ https://s3.amazonaws.com/pkerp/public/Dixon2012-J1-NcoI-R1-filtered.100kb.multires.cool
wget -q -NP data/ https://s3.amazonaws.com/pkerp/public/hic-resolutions.cool
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ dependencies = [
"slugid",
"sortedcontainers",
"tqdm",
"smart_open"
]
license = { text = "MIT" }
readme = "README.md"
Expand Down
22 changes: 22 additions & 0 deletions test/tiles/chromsizes_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import os.path as op

import clodius.tiles.chromsizes as ctcs
from clodius.models.tileset_info import TilesetInfo


def test_get_tileset_info():
filename = op.join("data", "chromSizes.tsv")

# Test loading tileset info using a filename
tsinfo = TilesetInfo(**ctcs.tileset_info(filename))

assert tsinfo.max_width > 100
assert len(tsinfo.chromsizes) > 2
# TODO: Do something with the return value
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The parsing with the pydantic model is a kind of assertion here; is that what you were thinking with respect to "return value"?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I meant to remove that line. Removing in the next commit.


with open(filename, "rb") as f:
# Test loading using a file-like object
tsinfo = TilesetInfo(**ctcs.tileset_info(f))

assert tsinfo.max_width > 100
assert len(tsinfo.chromsizes) > 2