From f027f5bb34c04ccb103e2f4e528bd52e4f80d7b7 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 22 May 2026 17:21:15 +0100 Subject: [PATCH 1/3] enum bcf_sr_error fix and additions As enum bcf_sr_error didn't explicitly set an values, the first entry (open_failed) got the value zero. As a check against zero is suggested in example code (and also used frequently in bcftools) to test for errors, it was not possible to distinguish open_failed from no error. Fix by deprecating open_failed and replacing it by a new non-zero value `bcf_sr_open_failed` to cover the case. A new entry bcf_sr_ok is added to make the OK case explicit. New entries bcf_sr_seek_error, bcf_sr_regions_error, and bcf_sr_samples_error are added for use by later updates. Signed-off-by: Rob Davies --- htslib/synced_bcf_reader.h | 18 ++++++++++++++++-- synced_bcf_reader.c | 8 +++++++- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/htslib/synced_bcf_reader.h b/htslib/synced_bcf_reader.h index a81ac371d..38c26a868 100644 --- a/htslib/synced_bcf_reader.h +++ b/htslib/synced_bcf_reader.h @@ -155,8 +155,22 @@ bcf_sr_t; typedef enum { - open_failed, not_bgzf, idx_load_failed, file_type_error, api_usage_error, - header_error, no_eof, no_memory, vcf_parse_error, bcf_read_error, noidx_error + bcf_sr_ok = 0, // No error + open_failed HTS_DEPRECATED_ENUM("Use bcf_sr_open_failed instead") = 0, + not_bgzf, // Input file not BGZF compressed + idx_load_failed, // Couldn't load the index + file_type_error, // Unsupported file type + api_usage_error, // API used incorrectly + header_error, // Header could not be read + no_eof, // Input file does not have an EOF block + no_memory, // Ran out of memory + vcf_parse_error, // Invalid VCF line + bcf_read_error, // Error reading file + noidx_error, // Input file not indexed, and streaming not possible + bcf_sr_open_failed, // Failed to open the input file + bcf_sr_seek_error, // Failed to seek to next location + bcf_sr_regions_error, // Failed to read the regions list + bcf_sr_samples_error, // Failed to read the samples list } bcf_sr_error; diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index d7780d6e1..cd39921f9 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -86,7 +86,7 @@ char *bcf_sr_strerror(int errnum) { switch (errnum) { - case open_failed: + case bcf_sr_open_failed: return strerror(errno); case not_bgzf: return "not compressed with bgzip"; @@ -108,6 +108,12 @@ char *bcf_sr_strerror(int errnum) return "BCF read error"; case noidx_error: return "merge of unindexed files failed"; + case bcf_sr_seek_error: + return "failed to seek to next location"; + case bcf_sr_regions_error: + return "failed to read regions/targets list"; + case bcf_sr_samples_error: + return "failed to read samples list"; default: return ""; } } From ed0c5277756bd002e2015fd50efd0bd1d0b46f7f Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 22 May 2026 17:34:09 +0100 Subject: [PATCH 2/3] Improve synced_bcf_reader error reporting Mainly adds tests to ensure that memory allocations worked, along with places where hts_getline() failure may have been missed before. Ensures that errors propagate up to callers, and that bcf_srs_t::errnum is set consistently. Where possible, structures are left in a consistent state when recovering from an allocation failure. For example, fields recording how much memory was allocated are only updated after realloc() succeeds, so it does not appear that more space is available than is really the case. Fixes a couple of places where the synced reader would call exit(1) rather than return an error code to the caller. This does mean callers may have to become better at checking return values. Signed-off-by: Rob Davies --- bcf_sr_sort.c | 194 +++++++++++----- htslib/synced_bcf_reader.h | 59 ++--- synced_bcf_reader.c | 460 +++++++++++++++++++++++++++++-------- 3 files changed, 524 insertions(+), 189 deletions(-) diff --git a/bcf_sr_sort.c b/bcf_sr_sort.c index 9f5e6f3a1..ad0d1813b 100644 --- a/bcf_sr_sort.c +++ b/bcf_sr_sort.c @@ -107,8 +107,8 @@ static int multi_is_exact(var_t *avar, var_t *bvar) { if ( avar->nalt != bvar->nalt ) return 0; - int alen = strlen(avar->str); - int blen = strlen(bvar->str); + size_t alen = strlen(avar->str); + size_t blen = strlen(bvar->str); if ( alen != blen ) return 0; char *abeg = avar->str; @@ -214,11 +214,15 @@ static int merge_vsets(sr_sort_t *srt, int ivset, int jvset) varset_t *iv = &srt->vset[ivset]; varset_t *jv = &srt->vset[jvset]; + if (INT_MAX - iv->nvar < jv->nvar) + return -1; + kbs_bitwise_or(iv->mask,jv->mask); i = iv->nvar; + if (hts_resize(int, iv->nvar + jv->nvar, &iv->mvar, &iv->var, 0) < 0) + return -1; iv->nvar += jv->nvar; - hts_expand(int, iv->nvar, iv->mvar, iv->var); for (j=0; jnvar; j++,i++) iv->var[i] = jv->var[j]; int *imat = srt->pmat + ivset*srt->ngrp; @@ -238,9 +242,10 @@ static int push_vset(sr_sort_t *srt, int ivset) for (i=0; isr->nreaders; i++) { vcf_buf_t *buf = &srt->vcf_buf[i]; + if (hts_resize(bcf1_t*,buf->nrec + 1,&buf->mrec,&buf->rec,0) < 0) + return -1; + buf->rec[buf->nrec] = NULL; buf->nrec++; - hts_expand(bcf1_t*,buf->nrec,buf->mrec,buf->rec); - buf->rec[buf->nrec-1] = NULL; } for (i=0; invar; i++) { @@ -253,7 +258,7 @@ static int push_vset(sr_sort_t *srt, int ivset) } } remove_vset(srt, ivset); - return 0; // FIXME: check for errs in this function + return 0; } static int cmpstringp(const void *p1, const void *p2) @@ -303,9 +308,11 @@ void debug_vbuf(sr_sort_t *srt) static char *grp_create_key(sr_sort_t *srt) { - if ( !srt->str.l ) return strdup(""); - int i; - hts_expand(char*,srt->noff,srt->mcharp,srt->charp); + if ( !srt->str.l || srt->noff <= 0) return strdup(""); + size_t i; + if (hts_resize(char*,srt->noff,&srt->mcharp,&srt->charp,0) < 0) { + return NULL; + } for (i=0; inoff; i++) { srt->charp[i] = srt->str.s + srt->off[i]; @@ -317,27 +324,27 @@ static char *grp_create_key(sr_sort_t *srt) return NULL; for (i=0; inoff; i++) { - int len = strlen(srt->charp[i]); + size_t len = strlen(srt->charp[i]); memcpy(ptr, srt->charp[i], len); + ptr[len] = i+1==srt->noff ? 0 : ';'; ptr += len + 1; - ptr[-1] = i+1==srt->noff ? 0 : ';'; } return ret; } -int bcf_sr_sort_set_active(sr_sort_t *srt, int idx) -{ - hts_expand(int,idx+1,srt->mactive,srt->active); - srt->nactive = 1; - srt->active[srt->nactive - 1] = idx; - return 0; // FIXME: check for errs in this function -} int bcf_sr_sort_add_active(sr_sort_t *srt, int idx) { - hts_expand(int,idx+1,srt->mactive,srt->active); + if (hts_resize(int,srt->nactive+1,&srt->mactive,&srt->active,0) < 0) { + return -1; + } srt->nactive++; srt->active[srt->nactive - 1] = idx; return 0; // FIXME: check for errs in this function } +int bcf_sr_sort_set_active(sr_sort_t *srt, int idx) +{ + srt->nactive = 0; + return bcf_sr_sort_add_active(srt, idx); +} static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_pos_t min_pos) { if ( !srt->grp_str2int ) @@ -351,6 +358,8 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, bcf_sr_init_scores(srt); srt->grp_str2int = khash_str2int_init(); srt->var_str2int = khash_str2int_init(); + if (!srt->grp_str2int || !srt->var_str2int) + goto nomem; } int k; khash_t(str2int) *hash; @@ -376,29 +385,31 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, bcf_sr_t *reader = &readers->readers[ireader]; int rid = bcf_hdr_name2id(reader->header, chr); grp.nvar = 0; - hts_expand(int,reader->nbuffer,srt->moff,srt->off); + if (hts_resize(int,reader->nbuffer,&srt->moff,&srt->off,0) < 0) + goto nomem; srt->noff = 0; srt->str.l = 0; for (irec=1; irec<=reader->nbuffer; irec++) { + int e = 0; bcf1_t *line = reader->buffer[irec]; if ( line->rid!=rid || line->pos!=min_pos ) break; - if ( srt->str.l ) kputc(';',&srt->str); + if ( srt->str.l ) e |= kputc(';',&srt->str) < 0; srt->off[srt->noff++] = srt->str.l; size_t beg = srt->str.l; int end_pos = -1; if ( srt->pair & BCF_SR_PAIR_ID ) { - kputs(line->d.id,&srt->str); - kputc(':',&srt->str); + e |= kputs(line->d.id,&srt->str) < 0; + e |= kputc(':',&srt->str) < 0; } for (ivar=1; ivarn_allele; ivar++) { - if ( ivar>1 ) kputc(',',&srt->str); - kputs(line->d.allele[0],&srt->str); - kputc('>',&srt->str); - kputs(line->d.allele[ivar],&srt->str); + if ( ivar>1 ) e |= kputc(',',&srt->str) < 0; + e |= kputs(line->d.allele[0],&srt->str) < 0; + e |= kputc('>',&srt->str) < 0; + e |= kputs(line->d.allele[ivar],&srt->str) < 0; // If symbolic allele, check also the END tag in case there are multiple events, // such as s, starting at the same positions @@ -414,17 +425,20 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, } if ( end_pos ) { - kputc('/',&srt->str); - kputw(end_pos, &srt->str); + e |= kputc('/',&srt->str) < 0; + e |= kputw(end_pos, &srt->str) < 0; } } } if ( line->n_allele==1 ) { - kputs(line->d.allele[0],&srt->str); - kputsn(">.",2,&srt->str); + e |= kputs(line->d.allele[0],&srt->str) < 0; + e |= kputsn(">.",2,&srt->str) < 0; } + if (e) + goto nomem; + // Create new variant or attach to existing one. But careful, there can be duplicate // records with the same POS,REF,ALT (e.g. in dbSNP-b142). In such case, use a // hash table (srt->var_str2int) and a counter (var_idx) to ensure they are @@ -441,47 +455,82 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, if ( var->vcf[var->nvcf-1] != ireader ) break; srt->str.l = var_end; - kputw(var_idx, &srt->str); + if (kputw(var_idx, &srt->str) < 0) + goto nomem; var_str = beg + srt->str.s; var_idx++; } if ( ret==-1 ) { // the variant is not present, insert - ivar = srt->nvar++; - hts_expand0(var_t,srt->nvar,srt->mvar,srt->var); + if (hts_resize(var_t,srt->nvar+1,&srt->mvar,&srt->var,HTS_RESIZE_CLEAR) < 0) + goto nomem; + char *var_str_dup = strdup(var_str); + if (!var_str_dup) + goto nomem; + ivar = srt->nvar; srt->var[ivar].nvcf = 0; - khash_str2int_set(srt->var_str2int, strdup(var_str), ivar); + if (khash_str2int_set(srt->var_str2int, var_str_dup, ivar) < 0) + { + free(var_str_dup); + goto nomem; + } free(srt->var[ivar].str); // possible left-over from the previous position + srt->var[ivar].str = NULL; + srt->nvar++; } var_t *var = &srt->var[ivar]; var->nalt = line->n_allele - 1; var->type = bcf_get_variant_types(line); srt->str.s[var_end] = 0; if ( ret==-1 ) + { var->str = strdup(var_str); + if (!var->str) + goto nomem; + } int mvcf = var->mvcf; + if (hts_resize(int*, var->nvcf+1, &var->mvcf, &var->vcf, HTS_RESIZE_CLEAR)<0) + goto nomem; + if ( mvcf != var->mvcf ) + { + bcf1_t **new_rec = hts_realloc_p(var->rec, sizeof(bcf1_t*), var->mvcf); + if (!new_rec) + goto nomem; + var->rec = new_rec; + } + var->vcf[var->nvcf] = ireader; + var->rec[var->nvcf] = line; var->nvcf++; - hts_expand0(int*, var->nvcf, var->mvcf, var->vcf); - if ( mvcf != var->mvcf ) var->rec = hts_realloc_p(var->rec, sizeof(bcf1_t*), var->mvcf); - var->vcf[var->nvcf-1] = ireader; - var->rec[var->nvcf-1] = line; + if (hts_resize(var_t, grp.nvar+1, &grp.mvar, &grp.var, 0) < 0) + goto nomem; + grp.var[grp.nvar] = ivar; grp.nvar++; - hts_expand(var_t,grp.nvar,grp.mvar,grp.var); - grp.var[grp.nvar-1] = ivar; } char *grp_key = grp_create_key(srt); + if (!grp_key) + goto nomem; int ret = khash_str2int_get(srt->grp_str2int, grp_key, &igrp); if ( ret==-1 ) { - igrp = srt->ngrp++; - hts_expand0(grp_t, srt->ngrp, srt->mgrp, srt->grp); + if (hts_resize(grp_t, srt->ngrp + 1, &srt->mgrp, &srt->grp, HTS_RESIZE_CLEAR) < 0) + { + free(grp_key); + goto nomem; + } + igrp = srt->ngrp; free(srt->grp[igrp].var); + srt->grp[igrp].var = NULL; srt->grp[igrp] = grp; srt->grp[igrp].key = grp_key; - khash_str2int_set(srt->grp_str2int, grp_key, igrp); + if (khash_str2int_set(srt->grp_str2int, grp_key, igrp) < 0) + { + free(grp_key); + goto nomem; + } + srt->ngrp++; memset(&grp,0,sizeof(grp_t)); } else @@ -494,10 +543,7 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, for (ivar=0; ivarnvar; ivar++) { if ( kbs_resize(&srt->var[ivar].mask, srt->ngrp) < 0 ) - { - fprintf(stderr, "[%s:%d %s] kbs_resize failed\n", __FILE__,__LINE__,__func__); - exit(1); - } + goto nomem; kbs_clear(srt->var[ivar].mask); } for (igrp=0; igrpngrp; igrp++) @@ -512,20 +558,18 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, // create the initial list of variant sets for (ivar=0; ivarnvar; ivar++) { + if (hts_resize(varset_t, srt->nvset + 1, &srt->mvset, &srt->vset, HTS_RESIZE_CLEAR) < 0) + goto nomem; ivset = srt->nvset++; - hts_expand0(varset_t, srt->nvset, srt->mvset, srt->vset); - varset_t *vset = &srt->vset[ivset]; + if (hts_resize(var_t, 1, &vset->mvar, &vset->var, HTS_RESIZE_CLEAR) < 0) + goto nomem; + vset->var[0] = ivar; vset->nvar = 1; - hts_expand0(var_t, vset->nvar, vset->mvar, vset->var); - vset->var[vset->nvar-1] = ivar; var_t *var = &srt->var[ivar]; vset->cnt = var->nvcf; if ( kbs_resize(&vset->mask, srt->ngrp) < 0 ) - { - fprintf(stderr, "[%s:%d %s] kbs_resize failed\n", __FILE__,__LINE__,__func__); - exit(1); - } + goto nomem; kbs_clear(vset->mask); kbs_bitwise_or(vset->mask, var->mask); @@ -545,8 +589,10 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, #endif // initialize the pairing matrix - hts_expand(int, srt->ngrp*srt->nvset, srt->mpmat, srt->pmat); - hts_expand(int, srt->nvset, srt->mcnt, srt->cnt); + if (hts_resize(int, srt->ngrp*srt->nvset, &srt->mpmat, &srt->pmat, 0) < 0) + goto nomem; + if (hts_resize(int, srt->nvset, &srt->mcnt, &srt->cnt, 0) < 0) + goto nomem; memset(srt->pmat, 0, sizeof(*srt->pmat)*srt->ngrp*srt->nvset); for (ivset=0; ivsetnvset; ivset++) { @@ -581,18 +627,28 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, if ( ipair!=-1 && ipair!=imax ) { imax = merge_vsets(srt, imax, ipair); + if (imax < 0) + goto nomem; continue; } - push_vset(srt, imax); + if (push_vset(srt, imax) < 0) + goto nomem; } srt->chr = chr; srt->pos = min_pos; - return 0; // FIXME: check for errs in this function + return 0; + + nomem: + readers->errnum = no_memory; + return -1; } +// Returns number of readers with the current line (may be 0) +// On error, returns 0 but readers->errnum will have been set + int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_pos_t min_pos) { int i,j; @@ -603,8 +659,14 @@ int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_po srt->sr = readers; if ( srt->nsr < readers->nreaders ) { - srt->vcf_buf = hts_realloc_p(srt->vcf_buf, sizeof(vcf_buf_t), - readers->nreaders); + vcf_buf_t *new_buf = hts_realloc_p(srt->vcf_buf, sizeof(vcf_buf_t), + readers->nreaders); + if (!new_buf) + { + readers->errnum = no_memory; + return 0; + } + srt->vcf_buf = new_buf; memset(srt->vcf_buf + srt->nsr, 0, sizeof(vcf_buf_t)*(readers->nreaders - srt->nsr)); if ( srt->msr < srt->nsr ) srt->msr = srt->nsr; } @@ -624,7 +686,13 @@ int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_po readers->has_line[srt->active[0]] = 1; return 1; } - if ( !srt->chr || srt->pos!=min_pos || strcmp(srt->chr,chr) ) bcf_sr_sort_set(readers, srt, chr, min_pos); + if ( !srt->chr || srt->pos!=min_pos || strcmp(srt->chr,chr) ) + { + if (bcf_sr_sort_set(readers, srt, chr, min_pos) < 0) { + readers->errnum = no_memory; + return 0; + } + } if ( !srt->vcf_buf[0].nrec ) return 0; diff --git a/htslib/synced_bcf_reader.h b/htslib/synced_bcf_reader.h index 38c26a868..c9b1c884e 100644 --- a/htslib/synced_bcf_reader.h +++ b/htslib/synced_bcf_reader.h @@ -352,32 +352,32 @@ int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int HTSLIB_EXPORT int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file); - - -/* +/** * bcf_sr_regions_init() - * @regions: regions can be either a comma-separated list of regions - * (chr|chr:pos|chr:from-to|chr:from-) or VCF, BED, or - * tab-delimited file (the default). Uncompressed files - * are stored in memory while bgzip-compressed and tabix-indexed - * region files are streamed. - * @is_file: 0: regions is a comma-separated list of regions - * (chr|chr:pos|chr:from-to|chr:from-) - * 1: VCF, BED or tab-delimited file - * @chr, from, to: - * Column indexes of chromosome, start position and end position - * in the tab-delimited file. The positions are 1-based and - * inclusive. - * These parameters are ignored when reading from VCF, BED or - * tabix-indexed files. When end position column is not present, - * supply 'from' in place of 'to'. When 'to' is negative, first - * abs(to) will be attempted and if that fails, 'from' will be used - * instead. - * If chromosome name contains the characters ':' or '-', it should - * be put in curly brackets, for example as "{weird-chr-name:1-2}:1000-2000" + * @param regions regions can be either a comma-separated list of regions + * (chr|chr:pos|chr:from-to|chr:from-) or VCF, BED, or + * tab-delimited file (the default). Uncompressed files + * are stored in memory while bgzip-compressed and tabix-indexed + * region files are streamed. + * @param is_file 0: regions is a comma-separated list of regions + * (chr|chr:pos|chr:from-to|chr:from-) + * 1: VCF, BED or tab-delimited file + * @param chr Column index of chromosome + * @param from Column index of start position (1-based, inclusive) + * @param to Column index of end position (1-based, inclusive) + * @return Pointer to a populated bcf_sr_regions_t struct on success + * NULL on failure * * The bcf_sr_regions_t struct returned by a successful call should be freed * via bcf_sr_regions_destroy() when it is no longer needed. + * + * These @p chr, @p from and @p tp parameters are ignored when reading + * from VCF, BED or tabix-indexed files. When end position column is not + * present, supply 'from' in place of 'to'. When 'to' is negative, first + * abs(to) will be attempted and if that fails, 'from' will be used instead. + * + * If chromosome name contains the characters ':' or '-', it should + * be put in curly brackets, for example as "{weird-chr-name:1-2}:1000-2000" */ HTSLIB_EXPORT bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int chr, int from, int to); @@ -394,12 +394,15 @@ void bcf_sr_regions_destroy(bcf_sr_regions_t *regions); HTSLIB_EXPORT int bcf_sr_regions_seek(bcf_sr_regions_t *regions, const char *chr); -/* - * bcf_sr_regions_next() - retrieves next region. Returns 0 on success and -1 - * when all regions have been read. The fields reg->seq, reg->start and - * reg->end are filled with the genomic coordinates on success or with - * NULL,-1,-1 when no region is available. The coordinates are 0-based, - * inclusive. +/** + * Retrieve next region. + * @return 0 on success + * -1 when all regions have been read + * -2 on failure + * + * The fields reg->seq, reg->start and reg->end are filled with the + * genomic coordinates on success or with NULL,-1,-1 when no region is + * available. The coordinates are 0-based, inclusive. */ HTSLIB_EXPORT int bcf_sr_regions_next(bcf_sr_regions_t *reg); diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index cd39921f9..6353ea824 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -77,7 +77,7 @@ aux_t; static bcf_sr_regions_t *bcf_sr_regions_alloc(void); static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, hts_pos_t end); static bcf_sr_regions_t *_regions_init_string(const char *str); -static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec); +static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec, bcf_sr_error *errnum); static void _regions_sort_and_merge(bcf_sr_regions_t *reg); static int _bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t start, hts_pos_t end, int missed_reg_handler); static void bcf_sr_seek_start(bcf_srs_t *readers); @@ -175,7 +175,8 @@ static int *init_filters(bcf_hdr_t *hdr, const char *filters, int *nfilters) else { str.l = 0; - kputsn(prev, tmp-prev, &str); + if (kputsn(prev, tmp-prev, &str) < 0) + goto err; out[nout] = bcf_hdr_id2int(hdr, BCF_DT_ID, str.s); if ( out[nout]>=0 ) nout++; } @@ -200,12 +201,21 @@ int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file) { if ( readers->regions ) bcf_sr_regions_destroy(readers->regions); readers->regions = bcf_sr_regions_init(regions,is_file,0,1,-2); + if (!readers->regions) + { + readers->errnum = bcf_sr_regions_error; + return -1; + } bcf_sr_seek_start(readers); return 0; } readers->regions = bcf_sr_regions_init(regions,is_file,0,1,-2); - if ( !readers->regions ) return -1; + if ( !readers->regions ) + { + readers->errnum = bcf_sr_regions_error; + return -1; + } readers->explicit_regs = 1; readers->require_index = REQUIRE_IDX_; readers->regions->overlap = BCF_SR_AUX(readers)->regions_overlap; @@ -217,6 +227,7 @@ int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int if ( readers->nreaders || readers->targets ) { hts_log_error("Must call bcf_sr_set_targets() before bcf_sr_add_reader()"); + readers->errnum = api_usage_error; return -1; } if ( targets[0]=='^' ) @@ -225,7 +236,11 @@ int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int targets++; } readers->targets = bcf_sr_regions_init(targets,is_file,0,1,-2); - if ( !readers->targets ) return -1; + if ( !readers->targets ) + { + readers->errnum = bcf_sr_regions_error; + return -1; + } readers->targets_als = alleles; readers->targets->overlap = BCF_SR_AUX(readers)->targets_overlap; return 0; @@ -237,12 +252,18 @@ int bcf_sr_set_threads(bcf_srs_t *files, int n_threads) return 0; files->p = calloc(1, sizeof(*files->p)); - if (!files->p) { + if (!files->p) + { files->errnum = no_memory; return -1; } if (!(files->p->pool = hts_tpool_init(n_threads))) + { + files->errnum = no_memory; + free(files->p); + files->p = NULL; return -1; + } return 0; } @@ -254,6 +275,7 @@ void bcf_sr_destroy_threads(bcf_srs_t *files) { if (files->p->pool) hts_tpool_destroy(files->p->pool); free(files->p); + files->p = NULL; } int bcf_sr_add_reader(bcf_srs_t *files, const char *fname) @@ -266,13 +288,14 @@ int bcf_sr_add_reader(bcf_srs_t *files, const char *fname) vcf_open_mode(fmode+1, fname, NULL); htsFile* file_ptr = hts_open(fname, fmode); if ( ! file_ptr ) { - files->errnum = open_failed; + files->errnum = bcf_sr_open_failed; return 0; } //get idx name and pass to add_hreader idxname = strstr(fname, HTS_IDX_DELIM); if (idxname) idxname += strlen(HTS_IDX_DELIM); if (!(ret = bcf_sr_add_hreader(files, file_ptr, 1, idxname))) { + // bcf_sr_add_hreader() will have set files->errnum hts_close(file_ptr); //failed, close the file } return ret; @@ -280,17 +303,30 @@ int bcf_sr_add_reader(bcf_srs_t *files, const char *fname) int bcf_sr_add_hreader(bcf_srs_t *files, htsFile *file_ptr, int autoclose, const char *idxname) { - aux_t *auxdata = NULL; + aux_t *auxdata = BCF_SR_AUX(files); + assert(auxdata != NULL); if ( ! file_ptr ) { - files->errnum = open_failed; + files->errnum = bcf_sr_open_failed; return 0; } - files->has_line = hts_realloc_ps(files->has_line, sizeof(*files->has_line), - files->nreaders, 1); + int *new_has_line = hts_realloc_ps(files->has_line, sizeof(*files->has_line), + files->nreaders, 1); + if (!new_has_line) + goto nomem; + files->has_line = new_has_line; files->has_line[files->nreaders] = 0; - files->readers = hts_realloc_ps(files->readers, sizeof(*files->readers), - files->nreaders, 1); + bcf_sr_t *new_readers = hts_realloc_ps(files->readers, sizeof(*files->readers), + files->nreaders, 1); + if (!new_readers) + goto nomem; + files->readers = new_readers; + int *new_closefile = hts_realloc_ps(auxdata->closefile, sizeof(int), + files->nreaders, 1); + if (!new_closefile) + goto nomem; + auxdata->closefile = new_closefile; + auxdata->closefile[files->nreaders] = 0; // Will be updated later bcf_sr_t *reader = &files->readers[files->nreaders++]; memset(reader,0,sizeof(bcf_sr_t)); @@ -392,9 +428,15 @@ int bcf_sr_add_hreader(bcf_srs_t *files, htsFile *file_ptr, int autoclose, const } reader->fname = strdup(file_ptr->fn); + if (!reader->fname) + goto nomem; + if ( files->apply_filters ) + { reader->filter_ids = init_filters(reader->header, files->apply_filters, &reader->nfilter_ids); - + if (!reader->filter_ids) + goto nomem; + } // Update list of chromosomes if ( !files->explicit_regs && !files->streaming ) { @@ -405,16 +447,17 @@ int bcf_sr_add_hreader(bcf_srs_t *files, htsFile *file_ptr, int autoclose, const { files->regions = bcf_sr_regions_alloc(); if ( !files->regions ) - { - hts_log_error("Cannot allocate regions data structure"); - return 0; - } + goto nomem; } names = reader->tbx_idx ? tbx_seqnames(reader->tbx_idx, &n) : bcf_hdr_seqnames(reader->header, &n); for (i=0; iregions, names[i], -1, -1); + if (_regions_add(files->regions, names[i], -1, -1) < 0) + { + free(names); + goto nomem; + } } free(names); _regions_sort_and_merge(files->regions); @@ -442,26 +485,32 @@ int bcf_sr_add_hreader(bcf_srs_t *files, htsFile *file_ptr, int autoclose, const } } - if ((auxdata = BCF_SR_AUX(files))) { - //store closure status for htsfile - int *tmp = hts_realloc_p(auxdata->closefile, sizeof(int), files->nreaders); - if (!tmp) { - hts_log_error("Failed to allocate memory"); - return 0; - } - tmp[files->nreaders - 1] = autoclose; - auxdata->closefile = tmp; - } + // Take ownership of the input file + auxdata->closefile[files->nreaders - 1] = autoclose; return 1; + + nomem: + files->errnum = no_memory; + return 0; } bcf_srs_t *bcf_sr_init(void) { bcf_srs_t *files = (bcf_srs_t*) calloc(1,sizeof(bcf_srs_t)); + if (!files) + return NULL; files->aux = (aux_t*) calloc(1,sizeof(aux_t)); + if (!files->aux) + { + free(files); + return NULL; + } + + // sr_sort_t struct already allocated, so this cannot fail bcf_sr_sort_init(&BCF_SR_AUX(files)->sort); + bcf_sr_set_opt(files,BCF_SR_REGIONS_OVERLAP,1); bcf_sr_set_opt(files,BCF_SR_TARGETS_OVERLAP,0); return files; @@ -469,6 +518,8 @@ bcf_srs_t *bcf_sr_init(void) static void bcf_sr_destroy1(bcf_sr_t *reader, int closefile) { + if (!reader) + return; free(reader->fname); if ( reader->tbx_idx ) tbx_destroy(reader->tbx_idx); if ( reader->bcf_idx ) hts_idx_destroy(reader->bcf_idx); @@ -488,6 +539,9 @@ static void bcf_sr_destroy1(bcf_sr_t *reader, int closefile) void bcf_sr_destroy(bcf_srs_t *files) { + if (!files) + return; + int i; int *autoclose = BCF_SR_AUX(files)->closefile; @@ -512,6 +566,9 @@ void bcf_sr_remove_reader(bcf_srs_t *files, int i) assert( !files->samples ); // not ready for this yet int *autoclose = BCF_SR_AUX(files)->closefile; + if (i < 0 || i >= files->nreaders) + return; + bcf_sr_sort_remove_reader(files, &BCF_SR_AUX(files)->sort, i); bcf_sr_destroy1(&files->readers[i], autoclose[i]); if ( i+1 < files->nreaders ) @@ -566,12 +623,16 @@ static inline int has_filter(bcf_sr_t *reader, bcf1_t *line) return 0; } +// Reset a reader and make it iterate over the range seq:start-end +// Returns 0 on success +// -1 if seq isn't in the index +// -2 if it couldn't make a new iterator static int _reader_seek(bcf_sr_t *reader, const char *seq, hts_pos_t start, hts_pos_t end) { if ( end>=MAX_CSI_COOR ) { hts_log_error("The coordinate is out of csi index limit: %"PRIhts_pos, end+1); - exit(1); + return -2; } if ( reader->itr ) { @@ -593,14 +654,16 @@ static int _reader_seek(bcf_sr_t *reader, const char *seq, hts_pos_t start, hts_ } if (!reader->itr) { hts_log_error("Could not seek: %s:%"PRIhts_pos"-%"PRIhts_pos, seq, start + 1, end + 1); - abort(); + return -2; } return 0; } /* * _readers_next_region() - jumps to next region if necessary - * Returns 0 on success or -1 when there are no more regions left + * Returns 0 on success + * -1 when there are no more regions left + * -2 on error (also sets files->errnum) */ static int _readers_next_region(bcf_srs_t *files) { @@ -618,11 +681,25 @@ static int _readers_next_region(bcf_srs_t *files) // No lines in the buffer, need to open new region or quit. int prev_iseq = files->regions->iseq; hts_pos_t prev_end = files->regions->end; - if ( bcf_sr_regions_next(files->regions)<0 ) return -1; + int res = bcf_sr_regions_next(files->regions); + if ( res<0 ) + { + if (res < -1) files->errnum = bcf_sr_seek_error; + return res; + } files->regions->prev_end = prev_iseq==files->regions->iseq ? prev_end : -1; for (i=0; inreaders; i++) - _reader_seek(&files->readers[i],files->regions->seq_names[files->regions->iseq],files->regions->start,files->regions->end); + { + res = _reader_seek(&files->readers[i], + files->regions->seq_names[files->regions->iseq], + files->regions->start,files->regions->end); + if (res < -1) + { + files->errnum = bcf_sr_seek_error; + return res; + } + } return 0; } @@ -658,6 +735,7 @@ static void _set_variant_boundaries(bcf1_t *rec, hts_pos_t *beg, hts_pos_t *end) /* * _reader_fill_buffer() - buffers all records with the same coordinate + * returns 0 on success, -1 on failure (also sets files->errnum) */ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) { @@ -669,54 +747,71 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) // Fill the buffer with records starting at the same position int i, ret = 0; + bcf_sr_error err = bcf_sr_ok; while (1) { - if ( reader->nbuffer+1 >= reader->mbuffer ) + if ( reader->mbuffer - reader->nbuffer <= 1 ) { // Increase buffer size + bcf1_t **new_buf = hts_realloc_ps(reader->buffer, sizeof(bcf1_t*), + reader->mbuffer, 8); + if (!new_buf) + { + err = files->errnum = no_memory; + break; + } + reader->buffer = new_buf; reader->mbuffer += 8; - reader->buffer = hts_realloc_p(reader->buffer, sizeof(bcf1_t*), reader->mbuffer); for (i=8; i>0; i--) // initialize { reader->buffer[reader->mbuffer-i] = bcf_init1(); + if (!reader->buffer[reader->mbuffer-i]) + break; reader->buffer[reader->mbuffer-i]->max_unpack = files->max_unpack; reader->buffer[reader->mbuffer-i]->pos = -1; // for rare cases when VCF starts from 1 } + if (i > 0) + { + reader->mbuffer -= i; // Adjust to number that got initialized + err = files->errnum = no_memory; + break; + } } if ( files->streaming ) { if ( reader->file->format.format==vcf ) { ret = hts_getline(reader->file, KS_SEP_LINE, &files->tmps); - if ( ret < -1 ) files->errnum = bcf_read_error; + if ( ret < -1 ) err = files->errnum = bcf_read_error; if ( ret < 0 ) break; // no more lines or an error ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]); - if ( ret<0 ) { files->errnum = vcf_parse_error; break; } + if ( ret<0 ) { err = files->errnum = vcf_parse_error; break; } } else if ( reader->file->format.format==bcf ) { ret = bcf_read1(reader->file, reader->header, reader->buffer[reader->nbuffer+1]); - if ( ret < -1 ) files->errnum = bcf_read_error; + if ( ret < -1 ) err = files->errnum = bcf_read_error; if ( ret < 0 ) break; // no more lines or an error } else { - hts_log_error("Fixme: not ready for this"); - exit(1); + hts_log_error("Unsupported file type for streaming"); + err = files->errnum = file_type_error; + break; } } else if ( reader->tbx_idx ) { ret = tbx_itr_next(reader->file, reader->tbx_idx, reader->itr, &files->tmps); - if ( ret < -1 ) files->errnum = bcf_read_error; + if ( ret < -1 ) err = files->errnum = bcf_read_error; if ( ret < 0 ) break; // no more lines or an error ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]); - if ( ret<0 ) { files->errnum = vcf_parse_error; break; } + if ( ret<0 ) { err = files->errnum = vcf_parse_error; break; } } else { ret = bcf_itr_next(reader->file, reader->itr, reader->buffer[reader->nbuffer+1]); - if ( ret < -1 ) files->errnum = bcf_read_error; + if ( ret < -1 ) err = files->errnum = bcf_read_error; if ( ret < 0 ) break; // no more lines or an error bcf_subset_format(reader->header,reader->buffer[reader->nbuffer+1]); } @@ -737,8 +832,10 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) _set_variant_boundaries(reader->buffer[reader->nbuffer+1], &beg,&end); else { - hts_log_error("This should never happen, just to keep clang compiler happy: %d",BCF_SR_AUX(files)->targets_overlap); - exit(1); + hts_log_error("Invalid BCF_SR_REGIONS_OVERLAP value: %d", + BCF_SR_AUX(files)->regions_overlap); + err = files->errnum = api_usage_error; + break; } if ( beg <= files->regions->prev_end || end < files->regions->start || beg > files->regions->end ) continue; } @@ -765,9 +862,9 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) if ( files->require_index==ALLOW_NO_IDX_ && reader->buffer[reader->nbuffer]->rid < reader->buffer[1]->rid ) { hts_log_error("Sequences out of order, cannot stream multiple unindexed files: %s", reader->fname); - exit(1); + err = files->errnum = noidx_error; } - return 0; // FIXME: Check for more errs in this function + return err == bcf_sr_ok ? 0 : -1; } /* @@ -785,6 +882,9 @@ static void _reader_shift_buffer(bcf_sr_t *reader) reader->nbuffer--; } +// Returns number of active readers (may be 0) +// On error, returns 0 but readers->errnum will have been set + static int next_line(bcf_srs_t *files) { const char *chr = NULL; @@ -794,13 +894,18 @@ static int next_line(bcf_srs_t *files) while ( 1 ) { // Get all readers ready for the next region. - if ( files->regions && _readers_next_region(files)<0 ) break; - + if ( files->regions ) + { + int r = _readers_next_region(files); + if ( r < -1 ) return 0; // Will have set files->errnum + if ( r < 0 ) break; + } // Fill buffers and find the minimum chromosome int i, min_rid = INT32_MAX; for (i=0; inreaders; i++) { - _reader_fill_buffer(files, &files->readers[i]); + if (_reader_fill_buffer(files, &files->readers[i]) < 0) + return 0; // Will have set files->errnum if ( files->require_index==ALLOW_NO_IDX_ ) { if ( !files->readers[i].nbuffer ) continue; @@ -818,11 +923,25 @@ static int next_line(bcf_srs_t *files) { min_pos = files->readers[i].buffer[1]->pos; chr = bcf_seqname(files->readers[i].header, files->readers[i].buffer[1]); - assert(chr); - bcf_sr_sort_set_active(&BCF_SR_AUX(files)->sort, i); + if (!chr) + { + files->errnum = bcf_read_error; + return 0; + } + if (bcf_sr_sort_set_active(&BCF_SR_AUX(files)->sort, i) < 0) + { + files->errnum = no_memory; + return 0; + } } else if ( min_pos==files->readers[i].buffer[1]->pos ) - bcf_sr_sort_add_active(&BCF_SR_AUX(files)->sort, i); + { + if (bcf_sr_sort_add_active(&BCF_SR_AUX(files)->sort, i) < 0) + { + files->errnum = no_memory; + return 0; + } + } } if ( min_pos==HTS_POS_MAX ) { @@ -849,10 +968,17 @@ static int next_line(bcf_srs_t *files) _set_variant_boundaries(files->readers[i].buffer[1], &beg,&end); else { - hts_log_error("This should never happen, just to keep clang compiler happy: %d",BCF_SR_AUX(files)->targets_overlap); - exit(1); + hts_log_error("Invalid BCF_SR_TARGETS_OVERLAP value: %d", + BCF_SR_AUX(files)->targets_overlap); + files->errnum = api_usage_error; + return 0; } int overlap = bcf_sr_regions_overlap(files->targets, chr, beg, end)==0 ? 1 : 0; + if (overlap < -1) + { + files->errnum = no_memory; + return 0; + } if ( (!files->targets_exclude && !overlap) || (files->targets_exclude && overlap) ) _reader_shift_buffer(&files->readers[i]); else @@ -885,7 +1011,8 @@ int bcf_sr_next_line(bcf_srs_t *files) for (i=0; inreaders; i++) if ( files->has_line[i] ) break; - if ( _regions_match_alleles(files->targets, files->targets_als-1, files->readers[i].buffer[0]) ) return ret; + int r = _regions_match_alleles(files->targets, files->targets_als-1, files->readers[i].buffer[0], &files->errnum); + if ( r ) return r > 0 ? ret : 0; // Check if there are more duplicate lines in the buffers. If not, return this line as if it // matched the targets, even if there is a type mismatch @@ -934,11 +1061,22 @@ int bcf_sr_seek(bcf_srs_t *readers, const char *seq, hts_pos_t pos) // - position regions to the requested position (bcf_sr_regions_overlap) bcf_sr_seek_start(readers); if ( khash_str2int_get(readers->regions->seq_hash, seq, &i)>=0 ) readers->regions->iseq = i; - _bcf_sr_regions_overlap(readers->regions, seq, pos, pos, 0); + int r = _bcf_sr_regions_overlap(readers->regions, seq, pos, pos, 0); + if (r < -2) + { + readers->errnum = bcf_sr_seek_error; + return 0; + } for (i=0; inreaders; i++) { - nret += _reader_seek(&readers->readers[i],seq,pos,MAX_CSI_COOR-1); + r = _reader_seek(&readers->readers[i],seq,pos,MAX_CSI_COOR-1); + if (r < -1) + { + readers->errnum = bcf_sr_seek_error; + return 0; + } + nret += r >= 0 ? r : 0; } return nret; } @@ -948,21 +1086,36 @@ int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file) int i, j, nsmpl, free_smpl = 0; char **smpl = NULL; - void *exclude = (fname[0]=='^') ? khash_str2int_init() : NULL; + void *exclude = NULL; + + if (fname[0]=='^') + { + exclude = khash_str2int_init(); + if (!exclude) + { + files->errnum = no_memory; + return 0; + } + } if ( exclude || strcmp("-",fname) ) // "-" stands for all samples { smpl = hts_readlist(fname, is_file, &nsmpl); if ( !smpl ) { hts_log_error("Could not read the file: \"%s\"", fname); + khash_str2int_destroy(exclude); + files->errnum = bcf_sr_samples_error; return 0; } + free_smpl = 1; if ( exclude ) { for (i=0; isamples = hts_realloc_ps(files->samples, sizeof(const char*), - files->n_smpl, 1); - files->samples[files->n_smpl++] = strdup(smpl[i]); + char **new_samples = hts_realloc_ps(files->samples, sizeof(char *), + files->n_smpl, 1); + if (!new_samples) + goto nomem; + files->samples = new_samples; + if ( (files->samples[files->n_smpl] = strdup(smpl[i])) == NULL ) + goto nomem; + files->n_smpl++; } if ( exclude ) khash_str2int_destroy(exclude); @@ -999,6 +1157,7 @@ int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file) { for (i=0; in_smpl ) @@ -1011,11 +1170,26 @@ int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file) { bcf_sr_t *reader = &files->readers[i]; reader->samples = hts_malloc_p(sizeof(int), files->n_smpl); + if (!reader->samples) + { + files->errnum = no_memory; + return 0; + } reader->n_smpl = files->n_smpl; for (j=0; jn_smpl; j++) reader->samples[j] = bcf_hdr_id2int(reader->header, BCF_DT_SAMPLE, files->samples[j]); } return 1; + + nomem: + if ( exclude ) khash_str2int_destroy(exclude); + if ( free_smpl ) + { + for (i=0; ierrnum = no_memory; + return 0; } // Allocate a new region list structure. @@ -1031,6 +1205,7 @@ static bcf_sr_regions_t *bcf_sr_regions_alloc(void) // Add a new region into a list. On input the coordinates are 1-based, inclusive, then stored 0-based, // inclusive. Sorting and merging step needed afterwards: qsort(..,cmp_regions) and merge_regions(). +// Returns 0 on success, -1 on failure static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, hts_pos_t end) { if ( start==-1 && end==-1 ) @@ -1043,28 +1218,52 @@ static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, } if ( !reg->seq_hash ) + { reg->seq_hash = khash_str2int_init(); + if ( !reg->seq_hash ) + goto nomem; + } int iseq; if ( khash_str2int_get(reg->seq_hash, chr, &iseq)<0 ) { // the chromosome block does not exist - iseq = reg->nseqs++; - reg->seq_names = hts_realloc_p(reg->seq_names,sizeof(char*),reg->nseqs); - reg->regs = hts_realloc_p(reg->regs,sizeof(region_t),reg->nseqs); - memset(®->regs[reg->nseqs-1],0,sizeof(region_t)); + char **new_seq_names = hts_realloc_ps(reg->seq_names, + sizeof(char*), reg->nseqs, 1); + if (!new_seq_names) + goto nomem; + reg->seq_names = new_seq_names; + region_t *new_regs = hts_realloc_ps(reg->regs,sizeof(region_t), + reg->nseqs, 1); + if (!new_regs) + goto nomem; + reg->regs = new_regs; + + iseq = reg->nseqs; + memset(®->regs[iseq],0,sizeof(region_t)); reg->seq_names[iseq] = strdup(chr); + if (!reg->seq_names[iseq]) + goto nomem; reg->regs[iseq].creg = -1; - khash_str2int_set(reg->seq_hash,reg->seq_names[iseq],iseq); + if (khash_str2int_set(reg->seq_hash,reg->seq_names[iseq],iseq) < 0) + { + free(reg->seq_names[iseq]); + goto nomem; + } + reg->nseqs++; } region_t *creg = ®->regs[iseq]; - hts_expand(region1_t,creg->nregs+1,creg->mregs,creg->regs); + if (hts_resize(region1_t,creg->nregs+1,&creg->mregs,&creg->regs,0) < 0) + goto nomem; creg->regs[creg->nregs].start = start; creg->regs[creg->nregs].end = end; creg->nregs++; - return 0; // FIXME: check for errs in this function + return 0; + + nomem: + return -1; } static int regions_cmp(const void *aptr, const void *bptr) @@ -1108,10 +1307,15 @@ void _regions_sort_and_merge(bcf_sr_regions_t *reg) // Recognises regions in the form chr, chr:pos, chr:beg-end, chr:beg-, {weird-chr-name}:pos. // Cannot use hts_parse_region() as that requires the header and if header is not present, // wouldn't learn the chromosome name. +// Returns populated bcf_sr_regions_t struct on success; NULL on failure static bcf_sr_regions_t *_regions_init_string(const char *str) { bcf_sr_regions_t *reg = bcf_sr_regions_alloc(); - if ( !reg ) return NULL; + if ( !reg ) + { + hts_log_error("Out of memory"); + return NULL; + } kstring_t tmp = {0,0,0}; const char *sp = str, *ep = str; @@ -1128,12 +1332,14 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) goto exit_nicely; } ep++; - kputsn(sp+1,ep-sp-2,&tmp); + if (kputsn(sp+1,ep-sp-2,&tmp) < 0) + goto exit_nicely_nomem; } else { while ( *ep && *ep!=',' && *ep!=':' ) ep++; - kputsn(sp,ep-sp,&tmp); + if (kputsn(sp,ep-sp,&tmp) < 0) + goto exit_nicely_nomem; } if ( *ep==':' ) { @@ -1146,7 +1352,8 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) } if ( !*ep || *ep==',' ) { - _regions_add(reg, tmp.s, from, from); + if (_regions_add(reg, tmp.s, from, from) < 0) + goto exit_nicely_nomem; sp = ep; continue; } @@ -1164,13 +1371,18 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) goto exit_nicely; } if ( sp==ep ) to = MAX_CSI_COOR-1; - _regions_add(reg, tmp.s, from, to); + if (_regions_add(reg, tmp.s, from, to) < 0) + goto exit_nicely_nomem; if ( !*ep ) break; sp = ep; } else if ( !*ep || *ep==',' ) { - if ( tmp.l ) _regions_add(reg, tmp.s, -1, -1); + if ( tmp.l ) + { + if (_regions_add(reg, tmp.s, -1, -1) < 0) + goto exit_nicely_nomem; + } if ( !*ep ) break; sp = ++ep; } @@ -1183,6 +1395,9 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) free(tmp.s); return reg; +exit_nicely_nomem: + hts_log_error("Out of memory"); + exit_nicely: bcf_sr_regions_destroy(reg); free(tmp.s); @@ -1254,9 +1469,15 @@ static int _regions_parse_line(char *line, int ichr, int ifrom, int ito, char ** bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr, int ifrom, int ito) { bcf_sr_regions_t *reg; + + if (!regions) + return NULL; + if ( !is_file ) { reg = _regions_init_string(regions); + if (!reg) + return NULL; _regions_sort_and_merge(reg); return reg; } @@ -1278,12 +1499,13 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr size_t iline = 0; int len = strlen(regions); int is_bed = strcasecmp(".bed",regions+len-4) ? 0 : 1; + int r; if ( !is_bed && !strcasecmp(".bed.gz",regions+len-7) ) is_bed = 1; if ( reg->file->format.format==vcf ) ito = 1; // read the whole file, tabix index is not present - while ( hts_getline(reg->file, KS_SEP_LINE, ®->line) > 0 ) + while ( ( r = hts_getline(reg->file, KS_SEP_LINE, ®->line) ) > 0 ) { iline++; char *chr, *chr_end; @@ -1298,7 +1520,7 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr { hts_log_error("Could not parse %zu-th line of file %s, using the columns %d,%d[,%d]", iline, regions,ichr+1,ifrom+1,ito+1); - hts_close(reg->file); reg->file = NULL; free(reg); + bcf_sr_regions_destroy(reg); return NULL; } ito = ifrom; @@ -1308,9 +1530,17 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr if ( !ret ) continue; if ( is_bed ) from++; *chr_end = 0; - _regions_add(reg, chr, from, to); + if (_regions_add(reg, chr, from, to) < 0) + goto nomem; *chr_end = '\t'; } + if ( r < -1 ) + { + hts_log_error("Error reading file: \"%s\" : %s", + regions, strerror(errno)); + bcf_sr_regions_destroy(reg); + return NULL; + } hts_close(reg->file); reg->file = NULL; if ( !reg->nseqs ) { free(reg); return NULL; } _regions_sort_and_merge(reg); @@ -1319,15 +1549,27 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr reg->seq_names = (char**) tbx_seqnames(reg->tbx, ®->nseqs); if ( !reg->seq_hash ) + { reg->seq_hash = khash_str2int_init(); + if (!reg->seq_hash) + goto nomem; + } int i; for (i=0; inseqs; i++) { - khash_str2int_set(reg->seq_hash,reg->seq_names[i],i); + if (khash_str2int_set(reg->seq_hash,reg->seq_names[i],i) < 0) + goto nomem; } reg->fname = strdup(regions); + if (!reg->fname) + goto nomem; reg->is_bin = 1; return reg; + + nomem: + hts_log_error("Out of memory"); + bcf_sr_regions_destroy(reg); + return NULL; } void bcf_sr_regions_destroy(bcf_sr_regions_t *reg) @@ -1426,7 +1668,7 @@ int bcf_sr_regions_next(bcf_sr_regions_t *reg) { // tabix index present, reading a chromosome block ret = tbx_itr_next(reg->file, reg->tbx, reg->itr, ®->line); - if ( ret<0 ) { reg->iseq = -1; return -1; } + if ( ret<0 ) { reg->iseq = -1; return ret; } } else { @@ -1441,21 +1683,27 @@ int bcf_sr_regions_next(bcf_sr_regions_t *reg) hts_log_error("Could not open file: %s", reg->fname); reg->file = NULL; bcf_sr_regions_destroy(reg); - return -1; + return -2; } reg->is_bin = 0; } // tabix index absent, reading the whole file ret = reg->file ? hts_getline(reg->file, KS_SEP_LINE, ®->line) : -1; - if ( ret<0 ) { reg->iseq = -1; return -1; } + if ( ret<-1) + { + hts_log_error("Error reading \"%s\": %s\n", + reg->fname, strerror(errno)); + return -2; + } + if ( ret<0 ) { reg->iseq = -1; return ret; } } ret = _regions_parse_line(reg->line.s, ichr,ifrom,ito, &chr,&chr_end,&from,&to); if ( ret<0 ) { hts_log_error("Could not parse the file %s, using the columns %d,%d,%d", reg->fname,ichr+1,ifrom+1,ito+1); - return -1; + return -2; } } if ( is_bed ) from++; @@ -1465,7 +1713,7 @@ int bcf_sr_regions_next(bcf_sr_regions_t *reg) { hts_log_error("Broken tabix index? The sequence \"%s\" not in dictionary [%s]", chr, reg->line.s); - exit(1); + return -2; } *chr_end = '\t'; @@ -1474,13 +1722,14 @@ int bcf_sr_regions_next(bcf_sr_regions_t *reg) return 0; } -static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec) +static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec, bcf_sr_error *errnum) { if ( reg->regs ) { // payload is not supported for in-memory regions, switch to regidx instead in future hts_log_error("Compressed and indexed targets file is required"); - exit(1); + *errnum = api_usage_error; + return -1; } int i = 0, max_len = 0; @@ -1499,9 +1748,11 @@ static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *re if ( *se==',' ) reg->nals++; se++; } - ks_resize(®->als_str, se-ss+1+reg->nals); + if (ks_resize(®->als_str, se-ss+1+reg->nals) < 0) + goto nomem; reg->als_str.l = 0; - hts_expand(char*,reg->nals,reg->mals,reg->als); + if (hts_resize(char*,reg->nals,®->mals,®->als, 0) < 0) + goto nomem; reg->nals = 0; se = ss; @@ -1510,14 +1761,16 @@ static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *re if ( *se=='\t' ) break; if ( *se!=',' ) continue; reg->als[reg->nals] = ®->als_str.s[reg->als_str.l]; - kputsn(ss,se-ss,®->als_str); + if (kputsn(ss,se-ss,®->als_str) < 0) + goto nomem; if ( ®->als_str.s[reg->als_str.l] - reg->als[reg->nals] > max_len ) max_len = ®->als_str.s[reg->als_str.l] - reg->als[reg->nals]; reg->als_str.l++; reg->nals++; ss = ++se; } reg->als[reg->nals] = ®->als_str.s[reg->als_str.l]; - kputsn(ss,se-ss,®->als_str); + if (kputsn(ss,se-ss,®->als_str) < 0) + goto nomem; if ( ®->als_str.s[reg->als_str.l] - reg->als[reg->nals] > max_len ) max_len = ®->als_str.s[reg->als_str.l] - reg->als[reg->nals]; reg->nals++; reg->als_type = max_len > 1 ? VCF_INDEL : VCF_SNP; // this is a simplified check, see vcf.c:bcf_set_variant_types @@ -1526,6 +1779,10 @@ static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *re if ( reg->als_type & VCF_INDEL ) return type & VCF_INDEL ? 1 : 0; return !(type & VCF_INDEL) ? 1 : 0; + + nomem: + *errnum = no_memory; + return -1; } int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t start, hts_pos_t end) @@ -1543,9 +1800,12 @@ static int _bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_p { // flush regions left on previous chromosome if ( missed_reg_handler && reg->prev_seq!=-1 && reg->iseq!=-1 ) - bcf_sr_regions_flush(reg); - - bcf_sr_regions_seek(reg, seq); + { + if ( bcf_sr_regions_flush(reg) < 0 ) + return -3; + } + if ( bcf_sr_regions_seek(reg, seq) < 0 ) + return -3; reg->start = reg->end = -1; } if ( reg->prev_seq==iseq && reg->iseq!=iseq ) return -2; // no more regions on this chromosome @@ -1554,7 +1814,9 @@ static int _bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_p while ( iseq==reg->iseq && reg->end < start ) { - if ( bcf_sr_regions_next(reg) < 0 ) return -2; // no more regions left + int r = bcf_sr_regions_next(reg); + if ( r < -1 ) return -3; // Something went wrong + if ( r < 0 ) return -2; // no more regions left if ( reg->iseq != iseq ) return -1; // does not overlap any regions if ( missed_reg_handler && reg->end < start ) reg->missed_reg_handler(reg, reg->missed_reg_data); } @@ -1564,8 +1826,10 @@ static int _bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_p int bcf_sr_regions_flush(bcf_sr_regions_t *reg) { + int r; if ( !reg->missed_reg_handler || reg->prev_seq==-1 ) return 0; - while ( !bcf_sr_regions_next(reg) ) reg->missed_reg_handler(reg, reg->missed_reg_data); - return 0; // FIXME: check for errs in this function + while ( (r = bcf_sr_regions_next(reg)) == 0 ) + reg->missed_reg_handler(reg, reg->missed_reg_data); + return r > -2 ? 0 : -1; } From fd2d45668c8753ff1cdedf4a5678bb270fa40e37 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 28 May 2026 12:41:37 +0100 Subject: [PATCH 3/3] Improve synced reader documentation Ensure return values are documented. Convert markup to better match doxygen format. Signed-off-by: Rob Davies --- htslib/synced_bcf_reader.h | 151 +++++++++++++++++++++++++------------ 1 file changed, 103 insertions(+), 48 deletions(-) diff --git a/htslib/synced_bcf_reader.h b/htslib/synced_bcf_reader.h index c9b1c884e..0599f5353 100644 --- a/htslib/synced_bcf_reader.h +++ b/htslib/synced_bcf_reader.h @@ -207,7 +207,7 @@ typedef struct bcf_srs_t bcf_srs_t; /** Allocate and initialize a bcf_srs_t struct. - * + * @return Pointer to a bcf_srs_t struct on success; NULL on failure. * The bcf_srs_t struct returned by a successful call should be freed * via bcf_sr_destroy() when it is no longer needed. */ @@ -218,18 +218,32 @@ bcf_srs_t *bcf_sr_init(void); HTSLIB_EXPORT void bcf_sr_destroy(bcf_srs_t *readers); + +/** Return a string describing a synced reader error + * @param errnum Error code (see enum bcf_sr_error) + * @return a char pointer to the error string + * @note The string returned should be treated as const. The caller should + * not try to edit or free it. + */ + HTSLIB_EXPORT char *bcf_sr_strerror(int errnum); +/** Set an option in the synced reader + * @param readers Synced reader struct + * @param opt Option to set + * @return 0 on success, 1 if the option was not recognised + */ + HTSLIB_EXPORT int bcf_sr_set_opt(bcf_srs_t *readers, bcf_sr_opt_t opt, ...); /** - * bcf_sr_set_threads() - allocates a thread-pool for use by the synced reader. - * @n_threads: size of thread pool + * Allocate a thread-pool for use by the synced reader. + * @param n_threads size of thread pool * - * Returns 0 if the call succeeded, or <0 on error. + * @return 0 if the call succeeded, or <0 on error. */ HTSLIB_EXPORT int bcf_sr_set_threads(bcf_srs_t *files, int n_threads); @@ -239,11 +253,11 @@ HTSLIB_EXPORT void bcf_sr_destroy_threads(bcf_srs_t *files); /** - * bcf_sr_add_reader() - open new reader - * @readers: holder of the open readers - * @fname: the VCF file + * Open new reader + * @param readers holder of the open readers + * @param fname the VCF file * - * Returns 1 if the call succeeded, or 0 on error. + * @return 1 if the call succeeded, or 0 on error. * * See also the bcf_srs_t data structure for parameters controlling * the reader's logic. @@ -253,19 +267,26 @@ HTSLIB_EXPORT int bcf_sr_add_reader(bcf_srs_t *readers, const char *fname); /** - * bcf_sr_add_hreader() - open new reader using htsfile - * @readers: holder of the open readers - * @file_ptr: htsfile already opened - * @autoclose: close file along with reader or not, 1 - close, 0 - do not close - * @idxname: index file name for file in @file_ptr + * Open new reader using htsfile + * @param readers holder of the open readers + * @param file_ptr htsfile already opened + * @param autoclose close file along with reader or not, 1 - close, 0 - do not close + * @param idxname index file name for file in @file_ptr * - * Returns 1 if the call succeeded, or 0 on error. + * @return 1 if the call succeeded, or 0 on error. * * See also the bcf_srs_t data structure for parameters controlling * the reader's logic. * If idxname is NULL, uses file_ptr->fn to find index file. * With idxname as NULL, index file must be present along with the file with * default name + * + * If the call succeeds, and @p autoclose is 1, ownership of @p file_ptr + * passes to @p readers, and the file will be closed automatically when either + * the reader is removed by bcf_sr_remove_reader(), or when all readers + * are closed by bcf_sr_destroy(). If this function fails, ownership of + * the file remains with the caller and the caller is responsible for cleaning + * it up. */ HTSLIB_EXPORT int bcf_sr_add_hreader(bcf_srs_t *readers, htsFile *file_ptr, int autoclose, @@ -275,12 +296,18 @@ HTSLIB_EXPORT void bcf_sr_remove_reader(bcf_srs_t *files, int i); /** - * bcf_sr_next_line() - the iterator - * @readers: holder of the open readers + * Synced reader iterator + * @param readers holder of the open readers + * @return Number of readers with current line on success + * 0 when finished, or on error * * Returns the number of readers which have the current line * (bcf_sr_t.buffer[0]) set at this position. Use the bcf_sr_has_line macro to * determine which of the readers are set. + * + * A return value of 0 indicates either that the iterator finished, or + * there was an error. Callers must check @p readers->errnum to tell + * these states apart. */ HTSLIB_EXPORT int bcf_sr_next_line(bcf_srs_t *readers); @@ -294,43 +321,47 @@ int bcf_sr_next_line(bcf_srs_t *readers); /** - * bcf_sr_seek() - set all readers to selected position - * @seq: sequence name; NULL to seek to start - * @pos: 0-based coordinate + * Set all readers to selected position + * @param seq sequence name; NULL to seek to start + * @param pos 0-based coordinate + * @return Number of readers with iterators over the new sequence on success + * 0 on failure + * + * @note Callers must check @p readers->errnum to detect failures */ HTSLIB_EXPORT int bcf_sr_seek(bcf_srs_t *readers, const char *seq, hts_pos_t pos); /** - * bcf_sr_set_samples() - sets active samples - * @readers: holder of the open readers - * @samples: this can be one of: file name with one sample per line; - * or column-separated list of samples; or '-' for a list of - * samples shared by all files. If first character is the - * exclamation mark, all but the listed samples are included. - * @is_file: 0: list of samples; 1: file with sample names + * Set active samples + * @param readers holder of the open readers + * @param samples this can be one of: file name with one sample per line; + * or column-separated list of samples; or '-' for a list of + * samples shared by all files. If first character is the + * exclamation mark, all but the listed samples are included. + * @param is_file 0: list of samples; 1: file with sample names * - * Returns 1 if the call succeeded, or 0 on error. + * @return 1 if the call succeeded, or 0 on error. */ HTSLIB_EXPORT int bcf_sr_set_samples(bcf_srs_t *readers, const char *samples, int is_file); -/** - * bcf_sr_set_targets(), bcf_sr_set_regions() - init targets/regions - * @readers: holder of the open readers - * @targets: list of regions, one-based and inclusive. - * @is_fname: 0: targets is a comma-separated list of regions (chr,chr:from-to) - * 1: targets is a tabix indexed file with a list of regions - * ( or ) - * - * Returns 0 if the call succeeded, or -1 on error. +/** Set targets filter + * @param readers holder of the open readers + * @param targets list of regions, one-based and inclusive. + * @param is_fname 0: targets is a comma-separated list of regions (chr,chr:from-to) + * 1: targets is a tabix indexed file with a list of regions + * ( or ) + * @param alleles One-based index of column listing alleles (see below) + * @return 0 if the call succeeded, or -1 on error. * - * Both functions behave the same way, unlisted positions will be skipped by - * bcf_sr_next_line(). However, there is an important difference: regions use + * Both bcf_sr_set_targets() and bcf_sr_set_regions() behave the same way, + * unlisted positions will be skipped by bcf_sr_next_line(). + * However, there is an important difference: regions uses an * index to jump to desired positions while targets streams the whole files * and merely skip unlisted positions. * - * Moreover, bcf_sr_set_targets() accepts an optional parameter $alleles which + * Moreover, bcf_sr_set_targets() accepts an optional parameter @p alleles which * is interpreted as a 1-based column index in the tab-delimited file where * alleles are listed. This in principle enables to perform the COLLAPSE_* * logic also with tab-delimited files. However, the current implementation @@ -349,11 +380,29 @@ int bcf_sr_set_samples(bcf_srs_t *readers, const char *samples, int is_file); HTSLIB_EXPORT int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int alleles); +/** Set region list + * @param readers holder of the open readers + * @param targets list of regions, one-based and inclusive. + * @param is_fname 0: targets is a comma-separated list of regions (chr,chr:from-to) + * 1: targets is a tabix indexed file with a list of regions + * ( or ) + * @return 0 if the call succeeded, or -1 on error. + * + * Both bcf_sr_set_targets() and bcf_sr_set_regions() behave the same way, + * unlisted positions will be skipped by bcf_sr_next_line(). + * However, there is an important difference: regions uses an + * index to jump to desired positions while targets streams the whole files + * and merely skip unlisted positions. + * + * API notes: + * - calling bcf_sr_set_regions AFTER readers have been initialized will + * reposition the readers and discard all previous regions. + */ HTSLIB_EXPORT int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file); /** - * bcf_sr_regions_init() + * Create a regions list * @param regions regions can be either a comma-separated list of regions * (chr|chr:pos|chr:from-to|chr:from-) or VCF, BED, or * tab-delimited file (the default). Uncompressed files @@ -386,8 +435,9 @@ HTSLIB_EXPORT void bcf_sr_regions_destroy(bcf_sr_regions_t *regions); /* - * bcf_sr_regions_seek() - seek to the chromosome block - * + * Seek regions to the chromosome block + * @param regions Pointer to regions list + * @param chr Chromosome to seek to * Returns 0 on success or -1 on failure. Sets reg->seq appropriately and * reg->start,reg->end to -1. */ @@ -407,14 +457,19 @@ int bcf_sr_regions_seek(bcf_sr_regions_t *regions, const char *chr); HTSLIB_EXPORT int bcf_sr_regions_next(bcf_sr_regions_t *reg); -/* - * bcf_sr_regions_overlap() - checks if the interval overlaps any of +/** Check if interval seq:start-end overlaps any regions + * @param reg Regions list + * @param seq Reference name + * @param start Start of interval in seq (0-based, inclusive) + * @param end End of interval in seq (0-based, inclusive) + * @return 0 if the position is in regions + * -1 if the position is not in the regions and more regions exist + * -2 if not in the regions and there are no more regions left + * -3 on failure + * + * Checks if the interval overlaps any of * the regions, the coordinates are 0-based, inclusive. The coordinate queries * must come in ascending order. - * - * Returns 0 if the position is in regions; -1 if the position is not in the - * regions and more regions exist; -2 if not in the regions and there are no more - * regions left. */ HTSLIB_EXPORT int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t start, hts_pos_t end);