diff --git a/bcf_sr_sort.c b/bcf_sr_sort.c index 9f5e6f3a1..ad0d1813b 100644 --- a/bcf_sr_sort.c +++ b/bcf_sr_sort.c @@ -107,8 +107,8 @@ static int multi_is_exact(var_t *avar, var_t *bvar) { if ( avar->nalt != bvar->nalt ) return 0; - int alen = strlen(avar->str); - int blen = strlen(bvar->str); + size_t alen = strlen(avar->str); + size_t blen = strlen(bvar->str); if ( alen != blen ) return 0; char *abeg = avar->str; @@ -214,11 +214,15 @@ static int merge_vsets(sr_sort_t *srt, int ivset, int jvset) varset_t *iv = &srt->vset[ivset]; varset_t *jv = &srt->vset[jvset]; + if (INT_MAX - iv->nvar < jv->nvar) + return -1; + kbs_bitwise_or(iv->mask,jv->mask); i = iv->nvar; + if (hts_resize(int, iv->nvar + jv->nvar, &iv->mvar, &iv->var, 0) < 0) + return -1; iv->nvar += jv->nvar; - hts_expand(int, iv->nvar, iv->mvar, iv->var); for (j=0; jnvar; j++,i++) iv->var[i] = jv->var[j]; int *imat = srt->pmat + ivset*srt->ngrp; @@ -238,9 +242,10 @@ static int push_vset(sr_sort_t *srt, int ivset) for (i=0; isr->nreaders; i++) { vcf_buf_t *buf = &srt->vcf_buf[i]; + if (hts_resize(bcf1_t*,buf->nrec + 1,&buf->mrec,&buf->rec,0) < 0) + return -1; + buf->rec[buf->nrec] = NULL; buf->nrec++; - hts_expand(bcf1_t*,buf->nrec,buf->mrec,buf->rec); - buf->rec[buf->nrec-1] = NULL; } for (i=0; invar; i++) { @@ -253,7 +258,7 @@ static int push_vset(sr_sort_t *srt, int ivset) } } remove_vset(srt, ivset); - return 0; // FIXME: check for errs in this function + return 0; } static int cmpstringp(const void *p1, const void *p2) @@ -303,9 +308,11 @@ void debug_vbuf(sr_sort_t *srt) static char *grp_create_key(sr_sort_t *srt) { - if ( !srt->str.l ) return strdup(""); - int i; - hts_expand(char*,srt->noff,srt->mcharp,srt->charp); + if ( !srt->str.l || srt->noff <= 0) return strdup(""); + size_t i; + if (hts_resize(char*,srt->noff,&srt->mcharp,&srt->charp,0) < 0) { + return NULL; + } for (i=0; inoff; i++) { srt->charp[i] = srt->str.s + srt->off[i]; @@ -317,27 +324,27 @@ static char *grp_create_key(sr_sort_t *srt) return NULL; for (i=0; inoff; i++) { - int len = strlen(srt->charp[i]); + size_t len = strlen(srt->charp[i]); memcpy(ptr, srt->charp[i], len); + ptr[len] = i+1==srt->noff ? 0 : ';'; ptr += len + 1; - ptr[-1] = i+1==srt->noff ? 0 : ';'; } return ret; } -int bcf_sr_sort_set_active(sr_sort_t *srt, int idx) -{ - hts_expand(int,idx+1,srt->mactive,srt->active); - srt->nactive = 1; - srt->active[srt->nactive - 1] = idx; - return 0; // FIXME: check for errs in this function -} int bcf_sr_sort_add_active(sr_sort_t *srt, int idx) { - hts_expand(int,idx+1,srt->mactive,srt->active); + if (hts_resize(int,srt->nactive+1,&srt->mactive,&srt->active,0) < 0) { + return -1; + } srt->nactive++; srt->active[srt->nactive - 1] = idx; return 0; // FIXME: check for errs in this function } +int bcf_sr_sort_set_active(sr_sort_t *srt, int idx) +{ + srt->nactive = 0; + return bcf_sr_sort_add_active(srt, idx); +} static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_pos_t min_pos) { if ( !srt->grp_str2int ) @@ -351,6 +358,8 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, bcf_sr_init_scores(srt); srt->grp_str2int = khash_str2int_init(); srt->var_str2int = khash_str2int_init(); + if (!srt->grp_str2int || !srt->var_str2int) + goto nomem; } int k; khash_t(str2int) *hash; @@ -376,29 +385,31 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, bcf_sr_t *reader = &readers->readers[ireader]; int rid = bcf_hdr_name2id(reader->header, chr); grp.nvar = 0; - hts_expand(int,reader->nbuffer,srt->moff,srt->off); + if (hts_resize(int,reader->nbuffer,&srt->moff,&srt->off,0) < 0) + goto nomem; srt->noff = 0; srt->str.l = 0; for (irec=1; irec<=reader->nbuffer; irec++) { + int e = 0; bcf1_t *line = reader->buffer[irec]; if ( line->rid!=rid || line->pos!=min_pos ) break; - if ( srt->str.l ) kputc(';',&srt->str); + if ( srt->str.l ) e |= kputc(';',&srt->str) < 0; srt->off[srt->noff++] = srt->str.l; size_t beg = srt->str.l; int end_pos = -1; if ( srt->pair & BCF_SR_PAIR_ID ) { - kputs(line->d.id,&srt->str); - kputc(':',&srt->str); + e |= kputs(line->d.id,&srt->str) < 0; + e |= kputc(':',&srt->str) < 0; } for (ivar=1; ivarn_allele; ivar++) { - if ( ivar>1 ) kputc(',',&srt->str); - kputs(line->d.allele[0],&srt->str); - kputc('>',&srt->str); - kputs(line->d.allele[ivar],&srt->str); + if ( ivar>1 ) e |= kputc(',',&srt->str) < 0; + e |= kputs(line->d.allele[0],&srt->str) < 0; + e |= kputc('>',&srt->str) < 0; + e |= kputs(line->d.allele[ivar],&srt->str) < 0; // If symbolic allele, check also the END tag in case there are multiple events, // such as s, starting at the same positions @@ -414,17 +425,20 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, } if ( end_pos ) { - kputc('/',&srt->str); - kputw(end_pos, &srt->str); + e |= kputc('/',&srt->str) < 0; + e |= kputw(end_pos, &srt->str) < 0; } } } if ( line->n_allele==1 ) { - kputs(line->d.allele[0],&srt->str); - kputsn(">.",2,&srt->str); + e |= kputs(line->d.allele[0],&srt->str) < 0; + e |= kputsn(">.",2,&srt->str) < 0; } + if (e) + goto nomem; + // Create new variant or attach to existing one. But careful, there can be duplicate // records with the same POS,REF,ALT (e.g. in dbSNP-b142). In such case, use a // hash table (srt->var_str2int) and a counter (var_idx) to ensure they are @@ -441,47 +455,82 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, if ( var->vcf[var->nvcf-1] != ireader ) break; srt->str.l = var_end; - kputw(var_idx, &srt->str); + if (kputw(var_idx, &srt->str) < 0) + goto nomem; var_str = beg + srt->str.s; var_idx++; } if ( ret==-1 ) { // the variant is not present, insert - ivar = srt->nvar++; - hts_expand0(var_t,srt->nvar,srt->mvar,srt->var); + if (hts_resize(var_t,srt->nvar+1,&srt->mvar,&srt->var,HTS_RESIZE_CLEAR) < 0) + goto nomem; + char *var_str_dup = strdup(var_str); + if (!var_str_dup) + goto nomem; + ivar = srt->nvar; srt->var[ivar].nvcf = 0; - khash_str2int_set(srt->var_str2int, strdup(var_str), ivar); + if (khash_str2int_set(srt->var_str2int, var_str_dup, ivar) < 0) + { + free(var_str_dup); + goto nomem; + } free(srt->var[ivar].str); // possible left-over from the previous position + srt->var[ivar].str = NULL; + srt->nvar++; } var_t *var = &srt->var[ivar]; var->nalt = line->n_allele - 1; var->type = bcf_get_variant_types(line); srt->str.s[var_end] = 0; if ( ret==-1 ) + { var->str = strdup(var_str); + if (!var->str) + goto nomem; + } int mvcf = var->mvcf; + if (hts_resize(int*, var->nvcf+1, &var->mvcf, &var->vcf, HTS_RESIZE_CLEAR)<0) + goto nomem; + if ( mvcf != var->mvcf ) + { + bcf1_t **new_rec = hts_realloc_p(var->rec, sizeof(bcf1_t*), var->mvcf); + if (!new_rec) + goto nomem; + var->rec = new_rec; + } + var->vcf[var->nvcf] = ireader; + var->rec[var->nvcf] = line; var->nvcf++; - hts_expand0(int*, var->nvcf, var->mvcf, var->vcf); - if ( mvcf != var->mvcf ) var->rec = hts_realloc_p(var->rec, sizeof(bcf1_t*), var->mvcf); - var->vcf[var->nvcf-1] = ireader; - var->rec[var->nvcf-1] = line; + if (hts_resize(var_t, grp.nvar+1, &grp.mvar, &grp.var, 0) < 0) + goto nomem; + grp.var[grp.nvar] = ivar; grp.nvar++; - hts_expand(var_t,grp.nvar,grp.mvar,grp.var); - grp.var[grp.nvar-1] = ivar; } char *grp_key = grp_create_key(srt); + if (!grp_key) + goto nomem; int ret = khash_str2int_get(srt->grp_str2int, grp_key, &igrp); if ( ret==-1 ) { - igrp = srt->ngrp++; - hts_expand0(grp_t, srt->ngrp, srt->mgrp, srt->grp); + if (hts_resize(grp_t, srt->ngrp + 1, &srt->mgrp, &srt->grp, HTS_RESIZE_CLEAR) < 0) + { + free(grp_key); + goto nomem; + } + igrp = srt->ngrp; free(srt->grp[igrp].var); + srt->grp[igrp].var = NULL; srt->grp[igrp] = grp; srt->grp[igrp].key = grp_key; - khash_str2int_set(srt->grp_str2int, grp_key, igrp); + if (khash_str2int_set(srt->grp_str2int, grp_key, igrp) < 0) + { + free(grp_key); + goto nomem; + } + srt->ngrp++; memset(&grp,0,sizeof(grp_t)); } else @@ -494,10 +543,7 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, for (ivar=0; ivarnvar; ivar++) { if ( kbs_resize(&srt->var[ivar].mask, srt->ngrp) < 0 ) - { - fprintf(stderr, "[%s:%d %s] kbs_resize failed\n", __FILE__,__LINE__,__func__); - exit(1); - } + goto nomem; kbs_clear(srt->var[ivar].mask); } for (igrp=0; igrpngrp; igrp++) @@ -512,20 +558,18 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, // create the initial list of variant sets for (ivar=0; ivarnvar; ivar++) { + if (hts_resize(varset_t, srt->nvset + 1, &srt->mvset, &srt->vset, HTS_RESIZE_CLEAR) < 0) + goto nomem; ivset = srt->nvset++; - hts_expand0(varset_t, srt->nvset, srt->mvset, srt->vset); - varset_t *vset = &srt->vset[ivset]; + if (hts_resize(var_t, 1, &vset->mvar, &vset->var, HTS_RESIZE_CLEAR) < 0) + goto nomem; + vset->var[0] = ivar; vset->nvar = 1; - hts_expand0(var_t, vset->nvar, vset->mvar, vset->var); - vset->var[vset->nvar-1] = ivar; var_t *var = &srt->var[ivar]; vset->cnt = var->nvcf; if ( kbs_resize(&vset->mask, srt->ngrp) < 0 ) - { - fprintf(stderr, "[%s:%d %s] kbs_resize failed\n", __FILE__,__LINE__,__func__); - exit(1); - } + goto nomem; kbs_clear(vset->mask); kbs_bitwise_or(vset->mask, var->mask); @@ -545,8 +589,10 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, #endif // initialize the pairing matrix - hts_expand(int, srt->ngrp*srt->nvset, srt->mpmat, srt->pmat); - hts_expand(int, srt->nvset, srt->mcnt, srt->cnt); + if (hts_resize(int, srt->ngrp*srt->nvset, &srt->mpmat, &srt->pmat, 0) < 0) + goto nomem; + if (hts_resize(int, srt->nvset, &srt->mcnt, &srt->cnt, 0) < 0) + goto nomem; memset(srt->pmat, 0, sizeof(*srt->pmat)*srt->ngrp*srt->nvset); for (ivset=0; ivsetnvset; ivset++) { @@ -581,18 +627,28 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, if ( ipair!=-1 && ipair!=imax ) { imax = merge_vsets(srt, imax, ipair); + if (imax < 0) + goto nomem; continue; } - push_vset(srt, imax); + if (push_vset(srt, imax) < 0) + goto nomem; } srt->chr = chr; srt->pos = min_pos; - return 0; // FIXME: check for errs in this function + return 0; + + nomem: + readers->errnum = no_memory; + return -1; } +// Returns number of readers with the current line (may be 0) +// On error, returns 0 but readers->errnum will have been set + int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_pos_t min_pos) { int i,j; @@ -603,8 +659,14 @@ int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_po srt->sr = readers; if ( srt->nsr < readers->nreaders ) { - srt->vcf_buf = hts_realloc_p(srt->vcf_buf, sizeof(vcf_buf_t), - readers->nreaders); + vcf_buf_t *new_buf = hts_realloc_p(srt->vcf_buf, sizeof(vcf_buf_t), + readers->nreaders); + if (!new_buf) + { + readers->errnum = no_memory; + return 0; + } + srt->vcf_buf = new_buf; memset(srt->vcf_buf + srt->nsr, 0, sizeof(vcf_buf_t)*(readers->nreaders - srt->nsr)); if ( srt->msr < srt->nsr ) srt->msr = srt->nsr; } @@ -624,7 +686,13 @@ int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_po readers->has_line[srt->active[0]] = 1; return 1; } - if ( !srt->chr || srt->pos!=min_pos || strcmp(srt->chr,chr) ) bcf_sr_sort_set(readers, srt, chr, min_pos); + if ( !srt->chr || srt->pos!=min_pos || strcmp(srt->chr,chr) ) + { + if (bcf_sr_sort_set(readers, srt, chr, min_pos) < 0) { + readers->errnum = no_memory; + return 0; + } + } if ( !srt->vcf_buf[0].nrec ) return 0; diff --git a/htslib/synced_bcf_reader.h b/htslib/synced_bcf_reader.h index a81ac371d..0599f5353 100644 --- a/htslib/synced_bcf_reader.h +++ b/htslib/synced_bcf_reader.h @@ -155,8 +155,22 @@ bcf_sr_t; typedef enum { - open_failed, not_bgzf, idx_load_failed, file_type_error, api_usage_error, - header_error, no_eof, no_memory, vcf_parse_error, bcf_read_error, noidx_error + bcf_sr_ok = 0, // No error + open_failed HTS_DEPRECATED_ENUM("Use bcf_sr_open_failed instead") = 0, + not_bgzf, // Input file not BGZF compressed + idx_load_failed, // Couldn't load the index + file_type_error, // Unsupported file type + api_usage_error, // API used incorrectly + header_error, // Header could not be read + no_eof, // Input file does not have an EOF block + no_memory, // Ran out of memory + vcf_parse_error, // Invalid VCF line + bcf_read_error, // Error reading file + noidx_error, // Input file not indexed, and streaming not possible + bcf_sr_open_failed, // Failed to open the input file + bcf_sr_seek_error, // Failed to seek to next location + bcf_sr_regions_error, // Failed to read the regions list + bcf_sr_samples_error, // Failed to read the samples list } bcf_sr_error; @@ -193,7 +207,7 @@ typedef struct bcf_srs_t bcf_srs_t; /** Allocate and initialize a bcf_srs_t struct. - * + * @return Pointer to a bcf_srs_t struct on success; NULL on failure. * The bcf_srs_t struct returned by a successful call should be freed * via bcf_sr_destroy() when it is no longer needed. */ @@ -204,18 +218,32 @@ bcf_srs_t *bcf_sr_init(void); HTSLIB_EXPORT void bcf_sr_destroy(bcf_srs_t *readers); + +/** Return a string describing a synced reader error + * @param errnum Error code (see enum bcf_sr_error) + * @return a char pointer to the error string + * @note The string returned should be treated as const. The caller should + * not try to edit or free it. + */ + HTSLIB_EXPORT char *bcf_sr_strerror(int errnum); +/** Set an option in the synced reader + * @param readers Synced reader struct + * @param opt Option to set + * @return 0 on success, 1 if the option was not recognised + */ + HTSLIB_EXPORT int bcf_sr_set_opt(bcf_srs_t *readers, bcf_sr_opt_t opt, ...); /** - * bcf_sr_set_threads() - allocates a thread-pool for use by the synced reader. - * @n_threads: size of thread pool + * Allocate a thread-pool for use by the synced reader. + * @param n_threads size of thread pool * - * Returns 0 if the call succeeded, or <0 on error. + * @return 0 if the call succeeded, or <0 on error. */ HTSLIB_EXPORT int bcf_sr_set_threads(bcf_srs_t *files, int n_threads); @@ -225,11 +253,11 @@ HTSLIB_EXPORT void bcf_sr_destroy_threads(bcf_srs_t *files); /** - * bcf_sr_add_reader() - open new reader - * @readers: holder of the open readers - * @fname: the VCF file + * Open new reader + * @param readers holder of the open readers + * @param fname the VCF file * - * Returns 1 if the call succeeded, or 0 on error. + * @return 1 if the call succeeded, or 0 on error. * * See also the bcf_srs_t data structure for parameters controlling * the reader's logic. @@ -239,19 +267,26 @@ HTSLIB_EXPORT int bcf_sr_add_reader(bcf_srs_t *readers, const char *fname); /** - * bcf_sr_add_hreader() - open new reader using htsfile - * @readers: holder of the open readers - * @file_ptr: htsfile already opened - * @autoclose: close file along with reader or not, 1 - close, 0 - do not close - * @idxname: index file name for file in @file_ptr + * Open new reader using htsfile + * @param readers holder of the open readers + * @param file_ptr htsfile already opened + * @param autoclose close file along with reader or not, 1 - close, 0 - do not close + * @param idxname index file name for file in @file_ptr * - * Returns 1 if the call succeeded, or 0 on error. + * @return 1 if the call succeeded, or 0 on error. * * See also the bcf_srs_t data structure for parameters controlling * the reader's logic. * If idxname is NULL, uses file_ptr->fn to find index file. * With idxname as NULL, index file must be present along with the file with * default name + * + * If the call succeeds, and @p autoclose is 1, ownership of @p file_ptr + * passes to @p readers, and the file will be closed automatically when either + * the reader is removed by bcf_sr_remove_reader(), or when all readers + * are closed by bcf_sr_destroy(). If this function fails, ownership of + * the file remains with the caller and the caller is responsible for cleaning + * it up. */ HTSLIB_EXPORT int bcf_sr_add_hreader(bcf_srs_t *readers, htsFile *file_ptr, int autoclose, @@ -261,12 +296,18 @@ HTSLIB_EXPORT void bcf_sr_remove_reader(bcf_srs_t *files, int i); /** - * bcf_sr_next_line() - the iterator - * @readers: holder of the open readers + * Synced reader iterator + * @param readers holder of the open readers + * @return Number of readers with current line on success + * 0 when finished, or on error * * Returns the number of readers which have the current line * (bcf_sr_t.buffer[0]) set at this position. Use the bcf_sr_has_line macro to * determine which of the readers are set. + * + * A return value of 0 indicates either that the iterator finished, or + * there was an error. Callers must check @p readers->errnum to tell + * these states apart. */ HTSLIB_EXPORT int bcf_sr_next_line(bcf_srs_t *readers); @@ -280,43 +321,47 @@ int bcf_sr_next_line(bcf_srs_t *readers); /** - * bcf_sr_seek() - set all readers to selected position - * @seq: sequence name; NULL to seek to start - * @pos: 0-based coordinate + * Set all readers to selected position + * @param seq sequence name; NULL to seek to start + * @param pos 0-based coordinate + * @return Number of readers with iterators over the new sequence on success + * 0 on failure + * + * @note Callers must check @p readers->errnum to detect failures */ HTSLIB_EXPORT int bcf_sr_seek(bcf_srs_t *readers, const char *seq, hts_pos_t pos); /** - * bcf_sr_set_samples() - sets active samples - * @readers: holder of the open readers - * @samples: this can be one of: file name with one sample per line; - * or column-separated list of samples; or '-' for a list of - * samples shared by all files. If first character is the - * exclamation mark, all but the listed samples are included. - * @is_file: 0: list of samples; 1: file with sample names + * Set active samples + * @param readers holder of the open readers + * @param samples this can be one of: file name with one sample per line; + * or column-separated list of samples; or '-' for a list of + * samples shared by all files. If first character is the + * exclamation mark, all but the listed samples are included. + * @param is_file 0: list of samples; 1: file with sample names * - * Returns 1 if the call succeeded, or 0 on error. + * @return 1 if the call succeeded, or 0 on error. */ HTSLIB_EXPORT int bcf_sr_set_samples(bcf_srs_t *readers, const char *samples, int is_file); -/** - * bcf_sr_set_targets(), bcf_sr_set_regions() - init targets/regions - * @readers: holder of the open readers - * @targets: list of regions, one-based and inclusive. - * @is_fname: 0: targets is a comma-separated list of regions (chr,chr:from-to) - * 1: targets is a tabix indexed file with a list of regions - * ( or ) +/** Set targets filter + * @param readers holder of the open readers + * @param targets list of regions, one-based and inclusive. + * @param is_fname 0: targets is a comma-separated list of regions (chr,chr:from-to) + * 1: targets is a tabix indexed file with a list of regions + * ( or ) + * @param alleles One-based index of column listing alleles (see below) + * @return 0 if the call succeeded, or -1 on error. * - * Returns 0 if the call succeeded, or -1 on error. - * - * Both functions behave the same way, unlisted positions will be skipped by - * bcf_sr_next_line(). However, there is an important difference: regions use + * Both bcf_sr_set_targets() and bcf_sr_set_regions() behave the same way, + * unlisted positions will be skipped by bcf_sr_next_line(). + * However, there is an important difference: regions uses an * index to jump to desired positions while targets streams the whole files * and merely skip unlisted positions. * - * Moreover, bcf_sr_set_targets() accepts an optional parameter $alleles which + * Moreover, bcf_sr_set_targets() accepts an optional parameter @p alleles which * is interpreted as a 1-based column index in the tab-delimited file where * alleles are listed. This in principle enables to perform the COLLAPSE_* * logic also with tab-delimited files. However, the current implementation @@ -335,35 +380,53 @@ int bcf_sr_set_samples(bcf_srs_t *readers, const char *samples, int is_file); HTSLIB_EXPORT int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int alleles); +/** Set region list + * @param readers holder of the open readers + * @param targets list of regions, one-based and inclusive. + * @param is_fname 0: targets is a comma-separated list of regions (chr,chr:from-to) + * 1: targets is a tabix indexed file with a list of regions + * ( or ) + * @return 0 if the call succeeded, or -1 on error. + * + * Both bcf_sr_set_targets() and bcf_sr_set_regions() behave the same way, + * unlisted positions will be skipped by bcf_sr_next_line(). + * However, there is an important difference: regions uses an + * index to jump to desired positions while targets streams the whole files + * and merely skip unlisted positions. + * + * API notes: + * - calling bcf_sr_set_regions AFTER readers have been initialized will + * reposition the readers and discard all previous regions. + */ HTSLIB_EXPORT int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file); - - -/* - * bcf_sr_regions_init() - * @regions: regions can be either a comma-separated list of regions - * (chr|chr:pos|chr:from-to|chr:from-) or VCF, BED, or - * tab-delimited file (the default). Uncompressed files - * are stored in memory while bgzip-compressed and tabix-indexed - * region files are streamed. - * @is_file: 0: regions is a comma-separated list of regions - * (chr|chr:pos|chr:from-to|chr:from-) - * 1: VCF, BED or tab-delimited file - * @chr, from, to: - * Column indexes of chromosome, start position and end position - * in the tab-delimited file. The positions are 1-based and - * inclusive. - * These parameters are ignored when reading from VCF, BED or - * tabix-indexed files. When end position column is not present, - * supply 'from' in place of 'to'. When 'to' is negative, first - * abs(to) will be attempted and if that fails, 'from' will be used - * instead. - * If chromosome name contains the characters ':' or '-', it should - * be put in curly brackets, for example as "{weird-chr-name:1-2}:1000-2000" +/** + * Create a regions list + * @param regions regions can be either a comma-separated list of regions + * (chr|chr:pos|chr:from-to|chr:from-) or VCF, BED, or + * tab-delimited file (the default). Uncompressed files + * are stored in memory while bgzip-compressed and tabix-indexed + * region files are streamed. + * @param is_file 0: regions is a comma-separated list of regions + * (chr|chr:pos|chr:from-to|chr:from-) + * 1: VCF, BED or tab-delimited file + * @param chr Column index of chromosome + * @param from Column index of start position (1-based, inclusive) + * @param to Column index of end position (1-based, inclusive) + * @return Pointer to a populated bcf_sr_regions_t struct on success + * NULL on failure * * The bcf_sr_regions_t struct returned by a successful call should be freed * via bcf_sr_regions_destroy() when it is no longer needed. + * + * These @p chr, @p from and @p tp parameters are ignored when reading + * from VCF, BED or tabix-indexed files. When end position column is not + * present, supply 'from' in place of 'to'. When 'to' is negative, first + * abs(to) will be attempted and if that fails, 'from' will be used instead. + * + * If chromosome name contains the characters ':' or '-', it should + * be put in curly brackets, for example as "{weird-chr-name:1-2}:1000-2000" */ HTSLIB_EXPORT bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int chr, int from, int to); @@ -372,32 +435,41 @@ HTSLIB_EXPORT void bcf_sr_regions_destroy(bcf_sr_regions_t *regions); /* - * bcf_sr_regions_seek() - seek to the chromosome block - * + * Seek regions to the chromosome block + * @param regions Pointer to regions list + * @param chr Chromosome to seek to * Returns 0 on success or -1 on failure. Sets reg->seq appropriately and * reg->start,reg->end to -1. */ HTSLIB_EXPORT int bcf_sr_regions_seek(bcf_sr_regions_t *regions, const char *chr); -/* - * bcf_sr_regions_next() - retrieves next region. Returns 0 on success and -1 - * when all regions have been read. The fields reg->seq, reg->start and - * reg->end are filled with the genomic coordinates on success or with - * NULL,-1,-1 when no region is available. The coordinates are 0-based, - * inclusive. +/** + * Retrieve next region. + * @return 0 on success + * -1 when all regions have been read + * -2 on failure + * + * The fields reg->seq, reg->start and reg->end are filled with the + * genomic coordinates on success or with NULL,-1,-1 when no region is + * available. The coordinates are 0-based, inclusive. */ HTSLIB_EXPORT int bcf_sr_regions_next(bcf_sr_regions_t *reg); -/* - * bcf_sr_regions_overlap() - checks if the interval overlaps any of +/** Check if interval seq:start-end overlaps any regions + * @param reg Regions list + * @param seq Reference name + * @param start Start of interval in seq (0-based, inclusive) + * @param end End of interval in seq (0-based, inclusive) + * @return 0 if the position is in regions + * -1 if the position is not in the regions and more regions exist + * -2 if not in the regions and there are no more regions left + * -3 on failure + * + * Checks if the interval overlaps any of * the regions, the coordinates are 0-based, inclusive. The coordinate queries * must come in ascending order. - * - * Returns 0 if the position is in regions; -1 if the position is not in the - * regions and more regions exist; -2 if not in the regions and there are no more - * regions left. */ HTSLIB_EXPORT int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t start, hts_pos_t end); diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index d7780d6e1..6353ea824 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -77,7 +77,7 @@ aux_t; static bcf_sr_regions_t *bcf_sr_regions_alloc(void); static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, hts_pos_t end); static bcf_sr_regions_t *_regions_init_string(const char *str); -static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec); +static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec, bcf_sr_error *errnum); static void _regions_sort_and_merge(bcf_sr_regions_t *reg); static int _bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t start, hts_pos_t end, int missed_reg_handler); static void bcf_sr_seek_start(bcf_srs_t *readers); @@ -86,7 +86,7 @@ char *bcf_sr_strerror(int errnum) { switch (errnum) { - case open_failed: + case bcf_sr_open_failed: return strerror(errno); case not_bgzf: return "not compressed with bgzip"; @@ -108,6 +108,12 @@ char *bcf_sr_strerror(int errnum) return "BCF read error"; case noidx_error: return "merge of unindexed files failed"; + case bcf_sr_seek_error: + return "failed to seek to next location"; + case bcf_sr_regions_error: + return "failed to read regions/targets list"; + case bcf_sr_samples_error: + return "failed to read samples list"; default: return ""; } } @@ -169,7 +175,8 @@ static int *init_filters(bcf_hdr_t *hdr, const char *filters, int *nfilters) else { str.l = 0; - kputsn(prev, tmp-prev, &str); + if (kputsn(prev, tmp-prev, &str) < 0) + goto err; out[nout] = bcf_hdr_id2int(hdr, BCF_DT_ID, str.s); if ( out[nout]>=0 ) nout++; } @@ -194,12 +201,21 @@ int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file) { if ( readers->regions ) bcf_sr_regions_destroy(readers->regions); readers->regions = bcf_sr_regions_init(regions,is_file,0,1,-2); + if (!readers->regions) + { + readers->errnum = bcf_sr_regions_error; + return -1; + } bcf_sr_seek_start(readers); return 0; } readers->regions = bcf_sr_regions_init(regions,is_file,0,1,-2); - if ( !readers->regions ) return -1; + if ( !readers->regions ) + { + readers->errnum = bcf_sr_regions_error; + return -1; + } readers->explicit_regs = 1; readers->require_index = REQUIRE_IDX_; readers->regions->overlap = BCF_SR_AUX(readers)->regions_overlap; @@ -211,6 +227,7 @@ int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int if ( readers->nreaders || readers->targets ) { hts_log_error("Must call bcf_sr_set_targets() before bcf_sr_add_reader()"); + readers->errnum = api_usage_error; return -1; } if ( targets[0]=='^' ) @@ -219,7 +236,11 @@ int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int targets++; } readers->targets = bcf_sr_regions_init(targets,is_file,0,1,-2); - if ( !readers->targets ) return -1; + if ( !readers->targets ) + { + readers->errnum = bcf_sr_regions_error; + return -1; + } readers->targets_als = alleles; readers->targets->overlap = BCF_SR_AUX(readers)->targets_overlap; return 0; @@ -231,12 +252,18 @@ int bcf_sr_set_threads(bcf_srs_t *files, int n_threads) return 0; files->p = calloc(1, sizeof(*files->p)); - if (!files->p) { + if (!files->p) + { files->errnum = no_memory; return -1; } if (!(files->p->pool = hts_tpool_init(n_threads))) + { + files->errnum = no_memory; + free(files->p); + files->p = NULL; return -1; + } return 0; } @@ -248,6 +275,7 @@ void bcf_sr_destroy_threads(bcf_srs_t *files) { if (files->p->pool) hts_tpool_destroy(files->p->pool); free(files->p); + files->p = NULL; } int bcf_sr_add_reader(bcf_srs_t *files, const char *fname) @@ -260,13 +288,14 @@ int bcf_sr_add_reader(bcf_srs_t *files, const char *fname) vcf_open_mode(fmode+1, fname, NULL); htsFile* file_ptr = hts_open(fname, fmode); if ( ! file_ptr ) { - files->errnum = open_failed; + files->errnum = bcf_sr_open_failed; return 0; } //get idx name and pass to add_hreader idxname = strstr(fname, HTS_IDX_DELIM); if (idxname) idxname += strlen(HTS_IDX_DELIM); if (!(ret = bcf_sr_add_hreader(files, file_ptr, 1, idxname))) { + // bcf_sr_add_hreader() will have set files->errnum hts_close(file_ptr); //failed, close the file } return ret; @@ -274,17 +303,30 @@ int bcf_sr_add_reader(bcf_srs_t *files, const char *fname) int bcf_sr_add_hreader(bcf_srs_t *files, htsFile *file_ptr, int autoclose, const char *idxname) { - aux_t *auxdata = NULL; + aux_t *auxdata = BCF_SR_AUX(files); + assert(auxdata != NULL); if ( ! file_ptr ) { - files->errnum = open_failed; + files->errnum = bcf_sr_open_failed; return 0; } - files->has_line = hts_realloc_ps(files->has_line, sizeof(*files->has_line), - files->nreaders, 1); + int *new_has_line = hts_realloc_ps(files->has_line, sizeof(*files->has_line), + files->nreaders, 1); + if (!new_has_line) + goto nomem; + files->has_line = new_has_line; files->has_line[files->nreaders] = 0; - files->readers = hts_realloc_ps(files->readers, sizeof(*files->readers), - files->nreaders, 1); + bcf_sr_t *new_readers = hts_realloc_ps(files->readers, sizeof(*files->readers), + files->nreaders, 1); + if (!new_readers) + goto nomem; + files->readers = new_readers; + int *new_closefile = hts_realloc_ps(auxdata->closefile, sizeof(int), + files->nreaders, 1); + if (!new_closefile) + goto nomem; + auxdata->closefile = new_closefile; + auxdata->closefile[files->nreaders] = 0; // Will be updated later bcf_sr_t *reader = &files->readers[files->nreaders++]; memset(reader,0,sizeof(bcf_sr_t)); @@ -386,9 +428,15 @@ int bcf_sr_add_hreader(bcf_srs_t *files, htsFile *file_ptr, int autoclose, const } reader->fname = strdup(file_ptr->fn); + if (!reader->fname) + goto nomem; + if ( files->apply_filters ) + { reader->filter_ids = init_filters(reader->header, files->apply_filters, &reader->nfilter_ids); - + if (!reader->filter_ids) + goto nomem; + } // Update list of chromosomes if ( !files->explicit_regs && !files->streaming ) { @@ -399,16 +447,17 @@ int bcf_sr_add_hreader(bcf_srs_t *files, htsFile *file_ptr, int autoclose, const { files->regions = bcf_sr_regions_alloc(); if ( !files->regions ) - { - hts_log_error("Cannot allocate regions data structure"); - return 0; - } + goto nomem; } names = reader->tbx_idx ? tbx_seqnames(reader->tbx_idx, &n) : bcf_hdr_seqnames(reader->header, &n); for (i=0; iregions, names[i], -1, -1); + if (_regions_add(files->regions, names[i], -1, -1) < 0) + { + free(names); + goto nomem; + } } free(names); _regions_sort_and_merge(files->regions); @@ -436,26 +485,32 @@ int bcf_sr_add_hreader(bcf_srs_t *files, htsFile *file_ptr, int autoclose, const } } - if ((auxdata = BCF_SR_AUX(files))) { - //store closure status for htsfile - int *tmp = hts_realloc_p(auxdata->closefile, sizeof(int), files->nreaders); - if (!tmp) { - hts_log_error("Failed to allocate memory"); - return 0; - } - tmp[files->nreaders - 1] = autoclose; - auxdata->closefile = tmp; - } + // Take ownership of the input file + auxdata->closefile[files->nreaders - 1] = autoclose; return 1; + + nomem: + files->errnum = no_memory; + return 0; } bcf_srs_t *bcf_sr_init(void) { bcf_srs_t *files = (bcf_srs_t*) calloc(1,sizeof(bcf_srs_t)); + if (!files) + return NULL; files->aux = (aux_t*) calloc(1,sizeof(aux_t)); + if (!files->aux) + { + free(files); + return NULL; + } + + // sr_sort_t struct already allocated, so this cannot fail bcf_sr_sort_init(&BCF_SR_AUX(files)->sort); + bcf_sr_set_opt(files,BCF_SR_REGIONS_OVERLAP,1); bcf_sr_set_opt(files,BCF_SR_TARGETS_OVERLAP,0); return files; @@ -463,6 +518,8 @@ bcf_srs_t *bcf_sr_init(void) static void bcf_sr_destroy1(bcf_sr_t *reader, int closefile) { + if (!reader) + return; free(reader->fname); if ( reader->tbx_idx ) tbx_destroy(reader->tbx_idx); if ( reader->bcf_idx ) hts_idx_destroy(reader->bcf_idx); @@ -482,6 +539,9 @@ static void bcf_sr_destroy1(bcf_sr_t *reader, int closefile) void bcf_sr_destroy(bcf_srs_t *files) { + if (!files) + return; + int i; int *autoclose = BCF_SR_AUX(files)->closefile; @@ -506,6 +566,9 @@ void bcf_sr_remove_reader(bcf_srs_t *files, int i) assert( !files->samples ); // not ready for this yet int *autoclose = BCF_SR_AUX(files)->closefile; + if (i < 0 || i >= files->nreaders) + return; + bcf_sr_sort_remove_reader(files, &BCF_SR_AUX(files)->sort, i); bcf_sr_destroy1(&files->readers[i], autoclose[i]); if ( i+1 < files->nreaders ) @@ -560,12 +623,16 @@ static inline int has_filter(bcf_sr_t *reader, bcf1_t *line) return 0; } +// Reset a reader and make it iterate over the range seq:start-end +// Returns 0 on success +// -1 if seq isn't in the index +// -2 if it couldn't make a new iterator static int _reader_seek(bcf_sr_t *reader, const char *seq, hts_pos_t start, hts_pos_t end) { if ( end>=MAX_CSI_COOR ) { hts_log_error("The coordinate is out of csi index limit: %"PRIhts_pos, end+1); - exit(1); + return -2; } if ( reader->itr ) { @@ -587,14 +654,16 @@ static int _reader_seek(bcf_sr_t *reader, const char *seq, hts_pos_t start, hts_ } if (!reader->itr) { hts_log_error("Could not seek: %s:%"PRIhts_pos"-%"PRIhts_pos, seq, start + 1, end + 1); - abort(); + return -2; } return 0; } /* * _readers_next_region() - jumps to next region if necessary - * Returns 0 on success or -1 when there are no more regions left + * Returns 0 on success + * -1 when there are no more regions left + * -2 on error (also sets files->errnum) */ static int _readers_next_region(bcf_srs_t *files) { @@ -612,11 +681,25 @@ static int _readers_next_region(bcf_srs_t *files) // No lines in the buffer, need to open new region or quit. int prev_iseq = files->regions->iseq; hts_pos_t prev_end = files->regions->end; - if ( bcf_sr_regions_next(files->regions)<0 ) return -1; + int res = bcf_sr_regions_next(files->regions); + if ( res<0 ) + { + if (res < -1) files->errnum = bcf_sr_seek_error; + return res; + } files->regions->prev_end = prev_iseq==files->regions->iseq ? prev_end : -1; for (i=0; inreaders; i++) - _reader_seek(&files->readers[i],files->regions->seq_names[files->regions->iseq],files->regions->start,files->regions->end); + { + res = _reader_seek(&files->readers[i], + files->regions->seq_names[files->regions->iseq], + files->regions->start,files->regions->end); + if (res < -1) + { + files->errnum = bcf_sr_seek_error; + return res; + } + } return 0; } @@ -652,6 +735,7 @@ static void _set_variant_boundaries(bcf1_t *rec, hts_pos_t *beg, hts_pos_t *end) /* * _reader_fill_buffer() - buffers all records with the same coordinate + * returns 0 on success, -1 on failure (also sets files->errnum) */ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) { @@ -663,54 +747,71 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) // Fill the buffer with records starting at the same position int i, ret = 0; + bcf_sr_error err = bcf_sr_ok; while (1) { - if ( reader->nbuffer+1 >= reader->mbuffer ) + if ( reader->mbuffer - reader->nbuffer <= 1 ) { // Increase buffer size + bcf1_t **new_buf = hts_realloc_ps(reader->buffer, sizeof(bcf1_t*), + reader->mbuffer, 8); + if (!new_buf) + { + err = files->errnum = no_memory; + break; + } + reader->buffer = new_buf; reader->mbuffer += 8; - reader->buffer = hts_realloc_p(reader->buffer, sizeof(bcf1_t*), reader->mbuffer); for (i=8; i>0; i--) // initialize { reader->buffer[reader->mbuffer-i] = bcf_init1(); + if (!reader->buffer[reader->mbuffer-i]) + break; reader->buffer[reader->mbuffer-i]->max_unpack = files->max_unpack; reader->buffer[reader->mbuffer-i]->pos = -1; // for rare cases when VCF starts from 1 } + if (i > 0) + { + reader->mbuffer -= i; // Adjust to number that got initialized + err = files->errnum = no_memory; + break; + } } if ( files->streaming ) { if ( reader->file->format.format==vcf ) { ret = hts_getline(reader->file, KS_SEP_LINE, &files->tmps); - if ( ret < -1 ) files->errnum = bcf_read_error; + if ( ret < -1 ) err = files->errnum = bcf_read_error; if ( ret < 0 ) break; // no more lines or an error ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]); - if ( ret<0 ) { files->errnum = vcf_parse_error; break; } + if ( ret<0 ) { err = files->errnum = vcf_parse_error; break; } } else if ( reader->file->format.format==bcf ) { ret = bcf_read1(reader->file, reader->header, reader->buffer[reader->nbuffer+1]); - if ( ret < -1 ) files->errnum = bcf_read_error; + if ( ret < -1 ) err = files->errnum = bcf_read_error; if ( ret < 0 ) break; // no more lines or an error } else { - hts_log_error("Fixme: not ready for this"); - exit(1); + hts_log_error("Unsupported file type for streaming"); + err = files->errnum = file_type_error; + break; } } else if ( reader->tbx_idx ) { ret = tbx_itr_next(reader->file, reader->tbx_idx, reader->itr, &files->tmps); - if ( ret < -1 ) files->errnum = bcf_read_error; + if ( ret < -1 ) err = files->errnum = bcf_read_error; if ( ret < 0 ) break; // no more lines or an error ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]); - if ( ret<0 ) { files->errnum = vcf_parse_error; break; } + if ( ret<0 ) { err = files->errnum = vcf_parse_error; break; } } else { ret = bcf_itr_next(reader->file, reader->itr, reader->buffer[reader->nbuffer+1]); - if ( ret < -1 ) files->errnum = bcf_read_error; + if ( ret < -1 ) err = files->errnum = bcf_read_error; if ( ret < 0 ) break; // no more lines or an error bcf_subset_format(reader->header,reader->buffer[reader->nbuffer+1]); } @@ -731,8 +832,10 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) _set_variant_boundaries(reader->buffer[reader->nbuffer+1], &beg,&end); else { - hts_log_error("This should never happen, just to keep clang compiler happy: %d",BCF_SR_AUX(files)->targets_overlap); - exit(1); + hts_log_error("Invalid BCF_SR_REGIONS_OVERLAP value: %d", + BCF_SR_AUX(files)->regions_overlap); + err = files->errnum = api_usage_error; + break; } if ( beg <= files->regions->prev_end || end < files->regions->start || beg > files->regions->end ) continue; } @@ -759,9 +862,9 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) if ( files->require_index==ALLOW_NO_IDX_ && reader->buffer[reader->nbuffer]->rid < reader->buffer[1]->rid ) { hts_log_error("Sequences out of order, cannot stream multiple unindexed files: %s", reader->fname); - exit(1); + err = files->errnum = noidx_error; } - return 0; // FIXME: Check for more errs in this function + return err == bcf_sr_ok ? 0 : -1; } /* @@ -779,6 +882,9 @@ static void _reader_shift_buffer(bcf_sr_t *reader) reader->nbuffer--; } +// Returns number of active readers (may be 0) +// On error, returns 0 but readers->errnum will have been set + static int next_line(bcf_srs_t *files) { const char *chr = NULL; @@ -788,13 +894,18 @@ static int next_line(bcf_srs_t *files) while ( 1 ) { // Get all readers ready for the next region. - if ( files->regions && _readers_next_region(files)<0 ) break; - + if ( files->regions ) + { + int r = _readers_next_region(files); + if ( r < -1 ) return 0; // Will have set files->errnum + if ( r < 0 ) break; + } // Fill buffers and find the minimum chromosome int i, min_rid = INT32_MAX; for (i=0; inreaders; i++) { - _reader_fill_buffer(files, &files->readers[i]); + if (_reader_fill_buffer(files, &files->readers[i]) < 0) + return 0; // Will have set files->errnum if ( files->require_index==ALLOW_NO_IDX_ ) { if ( !files->readers[i].nbuffer ) continue; @@ -812,11 +923,25 @@ static int next_line(bcf_srs_t *files) { min_pos = files->readers[i].buffer[1]->pos; chr = bcf_seqname(files->readers[i].header, files->readers[i].buffer[1]); - assert(chr); - bcf_sr_sort_set_active(&BCF_SR_AUX(files)->sort, i); + if (!chr) + { + files->errnum = bcf_read_error; + return 0; + } + if (bcf_sr_sort_set_active(&BCF_SR_AUX(files)->sort, i) < 0) + { + files->errnum = no_memory; + return 0; + } } else if ( min_pos==files->readers[i].buffer[1]->pos ) - bcf_sr_sort_add_active(&BCF_SR_AUX(files)->sort, i); + { + if (bcf_sr_sort_add_active(&BCF_SR_AUX(files)->sort, i) < 0) + { + files->errnum = no_memory; + return 0; + } + } } if ( min_pos==HTS_POS_MAX ) { @@ -843,10 +968,17 @@ static int next_line(bcf_srs_t *files) _set_variant_boundaries(files->readers[i].buffer[1], &beg,&end); else { - hts_log_error("This should never happen, just to keep clang compiler happy: %d",BCF_SR_AUX(files)->targets_overlap); - exit(1); + hts_log_error("Invalid BCF_SR_TARGETS_OVERLAP value: %d", + BCF_SR_AUX(files)->targets_overlap); + files->errnum = api_usage_error; + return 0; } int overlap = bcf_sr_regions_overlap(files->targets, chr, beg, end)==0 ? 1 : 0; + if (overlap < -1) + { + files->errnum = no_memory; + return 0; + } if ( (!files->targets_exclude && !overlap) || (files->targets_exclude && overlap) ) _reader_shift_buffer(&files->readers[i]); else @@ -879,7 +1011,8 @@ int bcf_sr_next_line(bcf_srs_t *files) for (i=0; inreaders; i++) if ( files->has_line[i] ) break; - if ( _regions_match_alleles(files->targets, files->targets_als-1, files->readers[i].buffer[0]) ) return ret; + int r = _regions_match_alleles(files->targets, files->targets_als-1, files->readers[i].buffer[0], &files->errnum); + if ( r ) return r > 0 ? ret : 0; // Check if there are more duplicate lines in the buffers. If not, return this line as if it // matched the targets, even if there is a type mismatch @@ -928,11 +1061,22 @@ int bcf_sr_seek(bcf_srs_t *readers, const char *seq, hts_pos_t pos) // - position regions to the requested position (bcf_sr_regions_overlap) bcf_sr_seek_start(readers); if ( khash_str2int_get(readers->regions->seq_hash, seq, &i)>=0 ) readers->regions->iseq = i; - _bcf_sr_regions_overlap(readers->regions, seq, pos, pos, 0); + int r = _bcf_sr_regions_overlap(readers->regions, seq, pos, pos, 0); + if (r < -2) + { + readers->errnum = bcf_sr_seek_error; + return 0; + } for (i=0; inreaders; i++) { - nret += _reader_seek(&readers->readers[i],seq,pos,MAX_CSI_COOR-1); + r = _reader_seek(&readers->readers[i],seq,pos,MAX_CSI_COOR-1); + if (r < -1) + { + readers->errnum = bcf_sr_seek_error; + return 0; + } + nret += r >= 0 ? r : 0; } return nret; } @@ -942,21 +1086,36 @@ int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file) int i, j, nsmpl, free_smpl = 0; char **smpl = NULL; - void *exclude = (fname[0]=='^') ? khash_str2int_init() : NULL; + void *exclude = NULL; + + if (fname[0]=='^') + { + exclude = khash_str2int_init(); + if (!exclude) + { + files->errnum = no_memory; + return 0; + } + } if ( exclude || strcmp("-",fname) ) // "-" stands for all samples { smpl = hts_readlist(fname, is_file, &nsmpl); if ( !smpl ) { hts_log_error("Could not read the file: \"%s\"", fname); + khash_str2int_destroy(exclude); + files->errnum = bcf_sr_samples_error; return 0; } + free_smpl = 1; if ( exclude ) { for (i=0; isamples = hts_realloc_ps(files->samples, sizeof(const char*), - files->n_smpl, 1); - files->samples[files->n_smpl++] = strdup(smpl[i]); + char **new_samples = hts_realloc_ps(files->samples, sizeof(char *), + files->n_smpl, 1); + if (!new_samples) + goto nomem; + files->samples = new_samples; + if ( (files->samples[files->n_smpl] = strdup(smpl[i])) == NULL ) + goto nomem; + files->n_smpl++; } if ( exclude ) khash_str2int_destroy(exclude); @@ -993,6 +1157,7 @@ int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file) { for (i=0; in_smpl ) @@ -1005,11 +1170,26 @@ int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file) { bcf_sr_t *reader = &files->readers[i]; reader->samples = hts_malloc_p(sizeof(int), files->n_smpl); + if (!reader->samples) + { + files->errnum = no_memory; + return 0; + } reader->n_smpl = files->n_smpl; for (j=0; jn_smpl; j++) reader->samples[j] = bcf_hdr_id2int(reader->header, BCF_DT_SAMPLE, files->samples[j]); } return 1; + + nomem: + if ( exclude ) khash_str2int_destroy(exclude); + if ( free_smpl ) + { + for (i=0; ierrnum = no_memory; + return 0; } // Allocate a new region list structure. @@ -1025,6 +1205,7 @@ static bcf_sr_regions_t *bcf_sr_regions_alloc(void) // Add a new region into a list. On input the coordinates are 1-based, inclusive, then stored 0-based, // inclusive. Sorting and merging step needed afterwards: qsort(..,cmp_regions) and merge_regions(). +// Returns 0 on success, -1 on failure static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, hts_pos_t end) { if ( start==-1 && end==-1 ) @@ -1037,28 +1218,52 @@ static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, } if ( !reg->seq_hash ) + { reg->seq_hash = khash_str2int_init(); + if ( !reg->seq_hash ) + goto nomem; + } int iseq; if ( khash_str2int_get(reg->seq_hash, chr, &iseq)<0 ) { // the chromosome block does not exist - iseq = reg->nseqs++; - reg->seq_names = hts_realloc_p(reg->seq_names,sizeof(char*),reg->nseqs); - reg->regs = hts_realloc_p(reg->regs,sizeof(region_t),reg->nseqs); - memset(®->regs[reg->nseqs-1],0,sizeof(region_t)); + char **new_seq_names = hts_realloc_ps(reg->seq_names, + sizeof(char*), reg->nseqs, 1); + if (!new_seq_names) + goto nomem; + reg->seq_names = new_seq_names; + region_t *new_regs = hts_realloc_ps(reg->regs,sizeof(region_t), + reg->nseqs, 1); + if (!new_regs) + goto nomem; + reg->regs = new_regs; + + iseq = reg->nseqs; + memset(®->regs[iseq],0,sizeof(region_t)); reg->seq_names[iseq] = strdup(chr); + if (!reg->seq_names[iseq]) + goto nomem; reg->regs[iseq].creg = -1; - khash_str2int_set(reg->seq_hash,reg->seq_names[iseq],iseq); + if (khash_str2int_set(reg->seq_hash,reg->seq_names[iseq],iseq) < 0) + { + free(reg->seq_names[iseq]); + goto nomem; + } + reg->nseqs++; } region_t *creg = ®->regs[iseq]; - hts_expand(region1_t,creg->nregs+1,creg->mregs,creg->regs); + if (hts_resize(region1_t,creg->nregs+1,&creg->mregs,&creg->regs,0) < 0) + goto nomem; creg->regs[creg->nregs].start = start; creg->regs[creg->nregs].end = end; creg->nregs++; - return 0; // FIXME: check for errs in this function + return 0; + + nomem: + return -1; } static int regions_cmp(const void *aptr, const void *bptr) @@ -1102,10 +1307,15 @@ void _regions_sort_and_merge(bcf_sr_regions_t *reg) // Recognises regions in the form chr, chr:pos, chr:beg-end, chr:beg-, {weird-chr-name}:pos. // Cannot use hts_parse_region() as that requires the header and if header is not present, // wouldn't learn the chromosome name. +// Returns populated bcf_sr_regions_t struct on success; NULL on failure static bcf_sr_regions_t *_regions_init_string(const char *str) { bcf_sr_regions_t *reg = bcf_sr_regions_alloc(); - if ( !reg ) return NULL; + if ( !reg ) + { + hts_log_error("Out of memory"); + return NULL; + } kstring_t tmp = {0,0,0}; const char *sp = str, *ep = str; @@ -1122,12 +1332,14 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) goto exit_nicely; } ep++; - kputsn(sp+1,ep-sp-2,&tmp); + if (kputsn(sp+1,ep-sp-2,&tmp) < 0) + goto exit_nicely_nomem; } else { while ( *ep && *ep!=',' && *ep!=':' ) ep++; - kputsn(sp,ep-sp,&tmp); + if (kputsn(sp,ep-sp,&tmp) < 0) + goto exit_nicely_nomem; } if ( *ep==':' ) { @@ -1140,7 +1352,8 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) } if ( !*ep || *ep==',' ) { - _regions_add(reg, tmp.s, from, from); + if (_regions_add(reg, tmp.s, from, from) < 0) + goto exit_nicely_nomem; sp = ep; continue; } @@ -1158,13 +1371,18 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) goto exit_nicely; } if ( sp==ep ) to = MAX_CSI_COOR-1; - _regions_add(reg, tmp.s, from, to); + if (_regions_add(reg, tmp.s, from, to) < 0) + goto exit_nicely_nomem; if ( !*ep ) break; sp = ep; } else if ( !*ep || *ep==',' ) { - if ( tmp.l ) _regions_add(reg, tmp.s, -1, -1); + if ( tmp.l ) + { + if (_regions_add(reg, tmp.s, -1, -1) < 0) + goto exit_nicely_nomem; + } if ( !*ep ) break; sp = ++ep; } @@ -1177,6 +1395,9 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) free(tmp.s); return reg; +exit_nicely_nomem: + hts_log_error("Out of memory"); + exit_nicely: bcf_sr_regions_destroy(reg); free(tmp.s); @@ -1248,9 +1469,15 @@ static int _regions_parse_line(char *line, int ichr, int ifrom, int ito, char ** bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr, int ifrom, int ito) { bcf_sr_regions_t *reg; + + if (!regions) + return NULL; + if ( !is_file ) { reg = _regions_init_string(regions); + if (!reg) + return NULL; _regions_sort_and_merge(reg); return reg; } @@ -1272,12 +1499,13 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr size_t iline = 0; int len = strlen(regions); int is_bed = strcasecmp(".bed",regions+len-4) ? 0 : 1; + int r; if ( !is_bed && !strcasecmp(".bed.gz",regions+len-7) ) is_bed = 1; if ( reg->file->format.format==vcf ) ito = 1; // read the whole file, tabix index is not present - while ( hts_getline(reg->file, KS_SEP_LINE, ®->line) > 0 ) + while ( ( r = hts_getline(reg->file, KS_SEP_LINE, ®->line) ) > 0 ) { iline++; char *chr, *chr_end; @@ -1292,7 +1520,7 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr { hts_log_error("Could not parse %zu-th line of file %s, using the columns %d,%d[,%d]", iline, regions,ichr+1,ifrom+1,ito+1); - hts_close(reg->file); reg->file = NULL; free(reg); + bcf_sr_regions_destroy(reg); return NULL; } ito = ifrom; @@ -1302,9 +1530,17 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr if ( !ret ) continue; if ( is_bed ) from++; *chr_end = 0; - _regions_add(reg, chr, from, to); + if (_regions_add(reg, chr, from, to) < 0) + goto nomem; *chr_end = '\t'; } + if ( r < -1 ) + { + hts_log_error("Error reading file: \"%s\" : %s", + regions, strerror(errno)); + bcf_sr_regions_destroy(reg); + return NULL; + } hts_close(reg->file); reg->file = NULL; if ( !reg->nseqs ) { free(reg); return NULL; } _regions_sort_and_merge(reg); @@ -1313,15 +1549,27 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr reg->seq_names = (char**) tbx_seqnames(reg->tbx, ®->nseqs); if ( !reg->seq_hash ) + { reg->seq_hash = khash_str2int_init(); + if (!reg->seq_hash) + goto nomem; + } int i; for (i=0; inseqs; i++) { - khash_str2int_set(reg->seq_hash,reg->seq_names[i],i); + if (khash_str2int_set(reg->seq_hash,reg->seq_names[i],i) < 0) + goto nomem; } reg->fname = strdup(regions); + if (!reg->fname) + goto nomem; reg->is_bin = 1; return reg; + + nomem: + hts_log_error("Out of memory"); + bcf_sr_regions_destroy(reg); + return NULL; } void bcf_sr_regions_destroy(bcf_sr_regions_t *reg) @@ -1420,7 +1668,7 @@ int bcf_sr_regions_next(bcf_sr_regions_t *reg) { // tabix index present, reading a chromosome block ret = tbx_itr_next(reg->file, reg->tbx, reg->itr, ®->line); - if ( ret<0 ) { reg->iseq = -1; return -1; } + if ( ret<0 ) { reg->iseq = -1; return ret; } } else { @@ -1435,21 +1683,27 @@ int bcf_sr_regions_next(bcf_sr_regions_t *reg) hts_log_error("Could not open file: %s", reg->fname); reg->file = NULL; bcf_sr_regions_destroy(reg); - return -1; + return -2; } reg->is_bin = 0; } // tabix index absent, reading the whole file ret = reg->file ? hts_getline(reg->file, KS_SEP_LINE, ®->line) : -1; - if ( ret<0 ) { reg->iseq = -1; return -1; } + if ( ret<-1) + { + hts_log_error("Error reading \"%s\": %s\n", + reg->fname, strerror(errno)); + return -2; + } + if ( ret<0 ) { reg->iseq = -1; return ret; } } ret = _regions_parse_line(reg->line.s, ichr,ifrom,ito, &chr,&chr_end,&from,&to); if ( ret<0 ) { hts_log_error("Could not parse the file %s, using the columns %d,%d,%d", reg->fname,ichr+1,ifrom+1,ito+1); - return -1; + return -2; } } if ( is_bed ) from++; @@ -1459,7 +1713,7 @@ int bcf_sr_regions_next(bcf_sr_regions_t *reg) { hts_log_error("Broken tabix index? The sequence \"%s\" not in dictionary [%s]", chr, reg->line.s); - exit(1); + return -2; } *chr_end = '\t'; @@ -1468,13 +1722,14 @@ int bcf_sr_regions_next(bcf_sr_regions_t *reg) return 0; } -static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec) +static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec, bcf_sr_error *errnum) { if ( reg->regs ) { // payload is not supported for in-memory regions, switch to regidx instead in future hts_log_error("Compressed and indexed targets file is required"); - exit(1); + *errnum = api_usage_error; + return -1; } int i = 0, max_len = 0; @@ -1493,9 +1748,11 @@ static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *re if ( *se==',' ) reg->nals++; se++; } - ks_resize(®->als_str, se-ss+1+reg->nals); + if (ks_resize(®->als_str, se-ss+1+reg->nals) < 0) + goto nomem; reg->als_str.l = 0; - hts_expand(char*,reg->nals,reg->mals,reg->als); + if (hts_resize(char*,reg->nals,®->mals,®->als, 0) < 0) + goto nomem; reg->nals = 0; se = ss; @@ -1504,14 +1761,16 @@ static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *re if ( *se=='\t' ) break; if ( *se!=',' ) continue; reg->als[reg->nals] = ®->als_str.s[reg->als_str.l]; - kputsn(ss,se-ss,®->als_str); + if (kputsn(ss,se-ss,®->als_str) < 0) + goto nomem; if ( ®->als_str.s[reg->als_str.l] - reg->als[reg->nals] > max_len ) max_len = ®->als_str.s[reg->als_str.l] - reg->als[reg->nals]; reg->als_str.l++; reg->nals++; ss = ++se; } reg->als[reg->nals] = ®->als_str.s[reg->als_str.l]; - kputsn(ss,se-ss,®->als_str); + if (kputsn(ss,se-ss,®->als_str) < 0) + goto nomem; if ( ®->als_str.s[reg->als_str.l] - reg->als[reg->nals] > max_len ) max_len = ®->als_str.s[reg->als_str.l] - reg->als[reg->nals]; reg->nals++; reg->als_type = max_len > 1 ? VCF_INDEL : VCF_SNP; // this is a simplified check, see vcf.c:bcf_set_variant_types @@ -1520,6 +1779,10 @@ static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *re if ( reg->als_type & VCF_INDEL ) return type & VCF_INDEL ? 1 : 0; return !(type & VCF_INDEL) ? 1 : 0; + + nomem: + *errnum = no_memory; + return -1; } int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t start, hts_pos_t end) @@ -1537,9 +1800,12 @@ static int _bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_p { // flush regions left on previous chromosome if ( missed_reg_handler && reg->prev_seq!=-1 && reg->iseq!=-1 ) - bcf_sr_regions_flush(reg); - - bcf_sr_regions_seek(reg, seq); + { + if ( bcf_sr_regions_flush(reg) < 0 ) + return -3; + } + if ( bcf_sr_regions_seek(reg, seq) < 0 ) + return -3; reg->start = reg->end = -1; } if ( reg->prev_seq==iseq && reg->iseq!=iseq ) return -2; // no more regions on this chromosome @@ -1548,7 +1814,9 @@ static int _bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_p while ( iseq==reg->iseq && reg->end < start ) { - if ( bcf_sr_regions_next(reg) < 0 ) return -2; // no more regions left + int r = bcf_sr_regions_next(reg); + if ( r < -1 ) return -3; // Something went wrong + if ( r < 0 ) return -2; // no more regions left if ( reg->iseq != iseq ) return -1; // does not overlap any regions if ( missed_reg_handler && reg->end < start ) reg->missed_reg_handler(reg, reg->missed_reg_data); } @@ -1558,8 +1826,10 @@ static int _bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_p int bcf_sr_regions_flush(bcf_sr_regions_t *reg) { + int r; if ( !reg->missed_reg_handler || reg->prev_seq==-1 ) return 0; - while ( !bcf_sr_regions_next(reg) ) reg->missed_reg_handler(reg, reg->missed_reg_data); - return 0; // FIXME: check for errs in this function + while ( (r = bcf_sr_regions_next(reg)) == 0 ) + reg->missed_reg_handler(reg, reg->missed_reg_data); + return r > -2 ? 0 : -1; }