Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ shlib-exports-*.txt
/test/sam
/test/ref_cache/*.tmp.*
/test/tabix/*.tmp.*
/test/bench_tbx_regions
/test/test_alloc
/test/test-bcf-sr
/test/test-bcf-translate
Expand All @@ -78,6 +79,7 @@ shlib-exports-*.txt
/test/test_realn
/test/test-regidx
/test/test_str2int
/test/test_tbx_multi
/test/test_time_funcs
/test/test-vcf-api
/test/test-vcf-sweep
Expand Down
12 changes: 11 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,9 @@ BUILT_TEST_PROGRAMS = \
test/test-parse-reg \
test/test_introspection \
test/test-bcf_set_variant_type \
test/test_hfile_libcurl
test/test_hfile_libcurl \
test/test_tbx_multi \
test/bench_tbx_regions

BUILT_THRASH_PROGRAMS = \
test/thrash_threads1 \
Expand Down Expand Up @@ -808,6 +810,12 @@ test/test-vcf-sweep: test/test-vcf-sweep.o libhts.a
test/test-bcf-sr: test/test-bcf-sr.o libhts.a
$(CC) $(LDFLAGS) -o $@ test/test-bcf-sr.o libhts.a $(LIBS) -lpthread

test/test_tbx_multi: test/test_tbx_multi.o libhts.a
$(CC) $(LDFLAGS) -o $@ test/test_tbx_multi.o libhts.a $(LIBS) -lpthread

test/bench_tbx_regions: test/bench_tbx_regions.o libhts.a
$(CC) $(LDFLAGS) -o $@ test/bench_tbx_regions.o libhts.a $(LIBS) -lpthread

test/test-bcf-translate: test/test-bcf-translate.o libhts.a
$(CC) $(LDFLAGS) -o $@ test/test-bcf-translate.o libhts.a $(LIBS) -lpthread

Expand Down Expand Up @@ -897,6 +905,8 @@ test/test_index.o: test/test_index.c config.h $(htslib_sam_h) $(htslib_vcf_h)
test/test-vcf-api.o: test/test-vcf-api.c config.h $(htslib_hts_h) $(htslib_hts_alloc_h) $(htslib_vcf_h) $(htslib_vcfutils_h) $(htslib_kbitset_h) $(htslib_kstring_h) $(htslib_kseq_h)
test/test-vcf-sweep.o: test/test-vcf-sweep.c config.h $(htslib_vcf_sweep_h)
test/test-bcf-sr.o: test/test-bcf-sr.c config.h $(htslib_hts_alloc_h) $(htslib_hts_defs_h) $(htslib_synced_bcf_reader_h) $(htslib_hts_h) $(htslib_vcf_h)
test/test_tbx_multi.o: test/test_tbx_multi.c config.h $(htslib_hts_h) $(htslib_tbx_h) $(htslib_kstring_h)
test/bench_tbx_regions.o: test/bench_tbx_regions.c config.h $(htslib_hts_h) $(htslib_tbx_h) $(htslib_kstring_h)
test/test-bcf-translate.o: test/test-bcf-translate.c config.h $(htslib_vcf_h)
test/test_introspection.o: test/test_introspection.c config.h $(htslib_hts_h) $(htslib_hfile_h)
test/test-bcf_set_variant_type.o: test/test-bcf_set_variant_type.c config.h $(htslib_hts_h) vcf.c
Expand Down
11 changes: 11 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,6 +1,17 @@
Noteworthy changes in release a.b
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

API additions
-------------

* Added tbx_itr_regions(), a multi-region iterator constructor for tabix
indexed text files (VCF.gz, BED, GFF, custom TSV). Returns an hts_itr_t
that, when driven by hts_itr_multi_next, yields records from all listed
regions in genome order with coalesced index lookups. Meaningfully
faster than calling tbx_itr_queryi() once per region when the region
list is large or dense. Mirrors how hts_itr_regions() already works for
BAM and CRAM. samtools/bcftools#2557.

Noteworthy changes in release 1.23.1 (18th March 2026)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down
8 changes: 8 additions & 0 deletions hts_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,14 @@ const char *hts_plugin_path(void);
*/
int bgzf_idx_push(BGZF *fp, hts_idx_t *hidx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped);

/*
* qsort comparator for hts_pair_pos_t intervals, sorting by (beg, end)
* ascending. Exposed because both region.c (sort within parsed reglists)
* and tbx.c (multi-region tabix iterator: hts_itr_multi_bam requires
* intervals to be sorted) need this comparator with identical behavior.
*/
int compare_hts_pair_pos_t(const void *av, const void *bv);

static inline int find_file_extension(const char *fn, char ext_out[static HTS_MAX_EXT_LEN])
{
const char *delim = fn ? strstr(fn, HTS_IDX_DELIM) : NULL, *ext;
Expand Down
54 changes: 54 additions & 0 deletions htslib/tbx.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,60 @@ extern const tbx_conf_t tbx_conf_gff, tbx_conf_bed, tbx_conf_psltbl, tbx_conf_sa
HTSLIB_EXPORT
int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, hts_pos_t *beg, hts_pos_t *end);

/// Construct a multi-region iterator over a tabix indexed text file.
/** Returns an hts_itr_t that, when driven by hts_itr_multi_next, yields
records from all the regions in reglist in genome order. Adjacent
and nearby regions are coalesced into combined tabix index lookups
by hts_itr_multi_next, so this is meaningfully faster than calling
tbx_itr_queryi once per region when the region list is large or
dense. See test/bench_tbx_regions.c for a reproducible measurement
of the speedup curve.

Reglist entries may specify their target contig either way:
- reglist[i].tid set to a tid (use tbx_name2id() to resolve)
with reglist[i].reg == NULL, or
- reglist[i].reg set to a reference name string such as "chr1"
(or "." for all-with-coords records, "*" for unmapped),
which this function resolves internally via tbx_name2id().
Each reglist[i].intervals must hold reglist[i].count
hts_pair_pos_t entries.

Semantics differ from running tbx_itr_queryi() multiple times in
one respect: each underlying file record is yielded at most once,
even when multiple intervals in reglist cover it. Duplicate or
overlapping intervals produce a single emission per matching
record, not one per matching interval. Callers that need
per-interval multiplicity must call tbx_itr_queryi() per interval.

Ownership: the caller's reglist is deep-copied internally and is
never mutated or freed by this function regardless of outcome.
The caller retains ownership of reglist (and is responsible for
freeing it) as well as of tbx and fp. tbx must remain valid for
the lifetime of the iterator. Destroy the iterator with
hts_itr_destroy.

Concurrency: only one multi-region tabix iterator may be active
on a given htsFile at a time. The iterator uses the BGZF
private_data slot on fp to thread tbx_t through hts_itr_multi_next;
constructing a second multi-region iterator on the same fp before
destroying the first would clobber that slot and yield wrong
records from the first. Single-region iterators (tbx_itr_queryi)
on the same fp are unaffected. This matches the existing
constraint on binary BCF iteration, which uses the same slot.

Lifetime: destroy the iterator (hts_itr_destroy) before destroying
the tbx (tbx_destroy) or closing the fp (hts_close). The iterator
retains a pointer to tbx via the BGZF private_data slot; destroying
tbx first leaves a dangling pointer in fp's BGZF cache that would
be returned by a later bgzf_get_private_data on the same fp.

Returns NULL if fp, tbx, or reglist is NULL; if count <= 0; if
fp is not BGZF; if memory cannot be allocated; or if the iterator
cannot otherwise be constructed. */
HTSLIB_EXPORT
hts_itr_t *tbx_itr_regions(htsFile *fp, tbx_t *tbx,
hts_reglist_t *reglist, int count);

/// Build an index of the lines in a BGZF-compressed file
/** The index struct returned by a successful call should be freed
via tbx_destroy() when it is no longer needed.
Expand Down
2 changes: 1 addition & 1 deletion region.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ typedef struct reglist
KHASH_MAP_INIT_INT(reg, reglist_t)
typedef kh_reg_t reghash_t;

static int compare_hts_pair_pos_t (const void *av, const void *bv)
int compare_hts_pair_pos_t (const void *av, const void *bv)
{
hts_pair_pos_t *a = (hts_pair_pos_t *) av;
hts_pair_pos_t *b = (hts_pair_pos_t *) bv;
Expand Down
201 changes: 195 additions & 6 deletions tbx.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ DEALINGS IN THE SOFTWARE. */
#include <errno.h>
#include "htslib/tbx.h"
#include "htslib/bgzf.h"
#include "bgzf_internal.h"
#include "htslib/hts_alloc.h"
#include "htslib/hts_endian.h"
#include "hts_internal.h"
Expand Down Expand Up @@ -351,19 +352,21 @@ static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_
* -1 on EOF
* <= -2 on error
*/
int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, hts_pos_t *beg, hts_pos_t *end)
/* Shared body between tbx_readrec (single region path) and tbx_readrec_multi
* (multi region path). Reads the next non-meta line from fp, parses it via
* get_intv, and returns the interval coordinates. Centralising the body here
* means that future fixes to meta_char handling or get_intv parsing apply to
* both paths automatically. */
static int tbx_readrec_impl(BGZF *fp, tbx_t *tbx, kstring_t *s,
int *tid, hts_pos_t *beg, hts_pos_t *end)
{
tbx_t *tbx = (tbx_t *) tbxv;
kstring_t *s = (kstring_t *) sv;
int ret;

// Get a line until either EOF or a non-meta character
do {
ret = bgzf_getline(fp, '\n', s);
} while (ret >= 0 && s->l && *s->s == tbx->conf.meta_char);

// Parse line
if (ret >= 0) {
if (ret >= 0) {
tbx_intv_t intv;
if (get_intv(tbx, s, &intv, 0) < 0)
return -2;
Expand All @@ -373,6 +376,192 @@ int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, hts_pos_t *beg, hts_po
return ret;
}

int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, hts_pos_t *beg, hts_pos_t *end)
{
return tbx_readrec_impl(fp, (tbx_t *) tbxv, (kstring_t *) sv, tid, beg, end);
}

/* Internal helpers for the multi-region tabix iterator. Mirror the
* sam.c bam_pseek/bam_ptell pattern but parameterised on BGZF rather than
* any BAM-specific structure. */
static int tbx_pseek(void *fp, int64_t offset, int where)
{
return bgzf_seek((BGZF *) fp, offset, where);
}

static int64_t tbx_ptell(void *fp)
{
BGZF *bgzf = (BGZF *) fp;
if (!bgzf) return -1L;
return bgzf_tell(bgzf);
}

/* Multi-region readrec callback. Retrieves tbx_t from BGZF private_data
* because hts_itr_multi_next passes the htsFile pointer as the second
* argument and has no mechanism to thread a tabix-specific context through.
* Mirrors how bcf_readrec retrieves bcf_hdr_t from BGZF private_data. */
static int tbx_readrec_multi(BGZF *fp, void *ignored, void *sv,
int *tid, hts_pos_t *beg, hts_pos_t *end)
{
tbx_t *tbx = (tbx_t *) bgzf_get_private_data(fp);
if (!tbx) {
/* The wrapper tbx_itr_regions always sets private_data before
* returning the iterator. Reaching here means either the slot
* was cleared mid-iteration (caller violated the single-iterator
* precondition) or the iterator was driven outside its lifetime.
* Return -2 rather than -1 so the caller sees an error rather
* than a silent end-of-stream. */
hts_log_error("Multi-region tabix iterator has no BGZF private_data;"
" concurrent multi-region iterators on the same"
" htsFile are not supported");
return -2;
}
return tbx_readrec_impl(fp, tbx, (kstring_t *) sv, tid, beg, end);
}

/* Deep copy a caller-provided reglist for internal use by tbx_itr_regions.
*
* Why deep copy: hts_itr_regions's ownership semantics on failure are
* inconsistent (NULL return may or may not have freed the reglist depending
* on which branch failed). By giving it our own owned copy, the caller's
* input is never touched, so they always own what they passed in regardless
* of outcome.
*
* Why resolve .reg here: hts_itr_regions invokes getid(hdr, reg) whenever
* reglist[i].reg is non-NULL. We pass getid=NULL, so we must clear .reg on
* the copy after pre-resolving it via tbx_name2id. This lets callers pass
* either pre-resolved tids or reference name strings, matching how
* sam_itr_regions handles getid internally.
*
* Returns 0 on success (*out points to a freshly allocated reglist that
* the caller must release via hts_reglist_free) or -1 on allocation
* failure. */
static int tbx_reglist_dup(tbx_t *tbx, const hts_reglist_t *src, int count,
hts_reglist_t **out)
{
hts_reglist_t *dst;
int i;

dst = calloc(count, sizeof(*dst));
if (!dst) return -1;

for (i = 0; i < count; i++) {
uint32_t n, k;
hts_pos_t mn, mx;

dst[i].tid = src[i].tid;
dst[i].count = src[i].count;
dst[i].min_beg = src[i].min_beg;
dst[i].max_end = src[i].max_end;
dst[i].reg = NULL; /* always pre-resolved below */
dst[i].intervals = NULL;

if (src[i].reg) {
if (!strcmp(src[i].reg, ".")) dst[i].tid = HTS_IDX_START;
else if (!strcmp(src[i].reg, "*")) dst[i].tid = HTS_IDX_NOCOOR;
else {
int tid = tbx_name2id(tbx, src[i].reg);
if (tid < 0)
hts_log_warning("Region '%s' refers to an unknown"
" reference name", src[i].reg);
dst[i].tid = tid;
}
}

/* An entry with count > 0 but intervals == NULL (or vice versa) is
* an inconsistent caller-side state; treat as having no intervals
* so we never hand hts_itr_multi_bam a non-zero count with a NULL
* intervals array, which it would dereference and crash on. */
if (!src[i].intervals || src[i].count == 0) {
dst[i].count = 0;
continue;
}

/* Read src[i].count into a local once and use the local for size
* computation, allocation, and bookkeeping. Avoids any TOCTOU
* mismatch between the size we allocate and the size we believe
* dst holds, even though callers are not expected to mutate src
* concurrently. */
n = src[i].count;
dst[i].intervals = malloc((size_t) n * sizeof(hts_pair_pos_t));
if (!dst[i].intervals) {
hts_reglist_free(dst, count);
return -1;
}
memcpy(dst[i].intervals, src[i].intervals,
(size_t) n * sizeof(hts_pair_pos_t));
dst[i].count = n;

/* Recompute min_beg / max_end from the copied intervals so they
* remain consistent even if the caller's summary fields were
* stale or if the later qsort reorders intervals[0]. */
mn = dst[i].intervals[0].beg;
mx = dst[i].intervals[0].end;
for (k = 1; k < n; k++) {
if (dst[i].intervals[k].beg < mn) mn = dst[i].intervals[k].beg;
if (dst[i].intervals[k].end > mx) mx = dst[i].intervals[k].end;
}
dst[i].min_beg = mn;
dst[i].max_end = mx;
}

*out = dst;
return 0;
}

hts_itr_t *tbx_itr_regions(htsFile *fp, tbx_t *tbx,
hts_reglist_t *reglist, int count)
{
BGZF *bgzf;
hts_reglist_t *owned = NULL;
hts_itr_t *itr;
int i;

if (!fp || !tbx || !reglist || count <= 0) return NULL;
bgzf = hts_get_bgzfp(fp);
if (!bgzf) return NULL;

/* Make our own copy of the reglist so the caller's input is never
* mutated and never assumed to be live past this call. The copy also
* lets us pre-resolve any .reg name strings to tids so we can pass
* getid=NULL safely to hts_itr_regions below. */
if (tbx_reglist_dup(tbx, reglist, count, &owned) < 0)
return NULL;

/* hts_itr_multi_bam (the itr_specific used here) requires intervals
* within each reglist entry to be sorted by start position; its
* advance logic uses the interval index packed into the offset list
* which relies on sorted iteration. Sort our copy now. */
for (i = 0; i < count; i++) {
if (owned[i].intervals && owned[i].count > 1) {
qsort(owned[i].intervals, owned[i].count,
sizeof(hts_pair_pos_t), compare_hts_pair_pos_t);
}
}

itr = hts_itr_regions(tbx->idx, owned, count, NULL, NULL,
hts_itr_multi_bam, tbx_readrec_multi,
tbx_pseek, tbx_ptell);
if (!itr) {
/* hts_itr_regions has exactly one failure path that does not
* free reglist via hts_itr_destroy: the OOM at its initial
* calloc(itr) before it ever assigns reg_list. Every other
* failure (itr_specific failure, getid failure) frees 'owned'.
* We accept the small leak in the calloc-OOM case rather than
* duplicate hts_itr_regions's allocation path here, because
* the caller's original reglist is untouched either way. */
return NULL;
}

/* Stash the tbx_t in BGZF private_data so tbx_readrec_multi can
* retrieve it on each call. The slot is otherwise unused for tabix
* indexed text files; only bcf_hdr_read uses it, for binary BCF.
* Caller retains ownership of tbx, which must remain valid for the
* lifetime of the iterator. */
bgzf_set_private_data(bgzf, tbx, NULL);
return itr;
}

static int tbx_set_meta(tbx_t *tbx)
{
int i, l = 0, l_nm;
Expand Down
Loading