44 # include <type_traits>
50 # include <htslib/kstring.h>
51 # include <htslib/tbx.h>
52 # include <htslib/vcf.h>
53 # include <htslib/vcfutils.h>
60 typename std::enable_if<std::is_same<T, std::string>::value || std::is_same<T, std::vector<char>>::value
61 || std::is_same<T, std::vector<int>>::value
62 || std::is_same<T, std::vector<float>>::value,
67 typename std::enable_if<std::is_same<T, std::string>::value || std::is_same<T, std::vector<int>>::value
68 || std::is_same<T, std::vector<float>>::value,
72 using isInfoVector =
typename std::enable_if<std::is_same<T, std::vector<int>>::value
73 || std::is_same<T, std::vector<float>>::value,
77 using isScalar =
typename std::enable_if<std::is_same<T, int>::value || std::is_same<T, float>::value
78 || std::is_same<T, double>::value,
82 using isString =
typename std::enable_if<std::is_same<T, std::string>::value,
bool>::type;
85 using isValidGT =
typename std::enable_if<std::is_same<T, std::vector<bool>>::value
86 || std::is_same<T, std::vector<char>>::value,
90 using isFormatVector =
typename std::enable_if<std::is_same<T, std::vector<float>>::value
91 || std::is_same<T, std::vector<char>>::value
92 || std::is_same<T, std::vector<int>>::value,
96 isScalar<T> isMissing(T
const & v)
98 if(std::is_same<T, float>::value)
100 return bcf_float_is_missing(v);
102 else if(std::is_same<T, int>::value)
104 return bcf_int32_missing(v);
109 inline bool isEndWith(std::string
const & s, std::string
const & e)
111 if(s.length() >= e.length())
113 return (0 == s.compare(s.length() - e.length(), e.length(), e));
122 inline std::string getMode(std::string
const & fname, std::string mode =
"r")
124 if(isEndWith(fname,
"bcf.gz")) mode +=
"b";
125 if(isEndWith(fname,
"bcf")) mode +=
"bu";
126 if(isEndWith(fname,
"vcf.gz")) mode +=
"z";
131 inline std::vector<std::string> split_string(
const std::string & s,
const std::string & separators)
133 std::vector<std::string> ret;
134 bool is_seperator[256] = {
false};
135 for(
auto & ch : separators)
137 is_seperator[(
unsigned int)ch] =
true;
140 for(
int i = 0; i <= (int)s.size(); i++)
142 if(is_seperator[(uint8_t)s[i]] || i == (int)s.size())
144 ret.push_back(std::string(s.begin() + begin, s.begin() + i));
154 void operator()(htsFile * x)
163 void operator()(bcf1_t * x)
165 if(x) bcf_destroy(x);
172 void operator()(bcf_hdr_t * x)
174 if(x) bcf_hdr_destroy(x);
190 bcf_hdr_t * hdr = NULL;
191 bcf_hrec_t * hrec = NULL;
198 if(hrec) bcf_hrec_destroy(hrec);
199 if(hdr) bcf_hdr_destroy(hdr);
218 const std::string & number,
219 const std::string & type,
220 const std::string & description)
222 addLine(
"##INFO=<ID=" +
id +
",Number=" + number +
",Type=" + type +
",Description=\"" + description
233 const std::string & number,
234 const std::string & type,
235 const std::string & description)
237 addLine(
"##FORMAT=<ID=" +
id +
",Number=" + number +
",Type=" + type +
",Description=\"" + description
246 inline void addFILTER(
const std::string &
id,
const std::string & description)
248 addLine(
"##FILTER=<ID=" +
id +
",Description=\"" + description +
"\">");
256 addLine(
"##contig=<ID=" +
id +
">");
265 ret = bcf_hdr_append(hdr, str.c_str());
266 if(ret != 0)
throw std::runtime_error(
"could not add " + str +
" to header\n");
267 ret = bcf_hdr_sync(hdr);
268 if(ret != 0)
throw std::runtime_error(
"could not add " + str +
" to header\n");
274 bcf_hdr_add_sample(hdr, sample.c_str());
275 if(bcf_hdr_sync(hdr) != 0)
277 throw std::runtime_error(
"couldn't add the sample.\n");
284 kstring_t s = {0, 0, NULL};
285 if(bcf_hdr_format(hdr, 0, &s) == 0)
287 std::string out(s.s, s.l);
293 throw std::runtime_error(
"failed to convert formatted header to string");
300 std::vector<std::string> vec;
301 for(
int i = 0; i < bcf_hdr_nsamples(hdr); i++)
303 vec.push_back(std::string(hdr->samples[i]));
312 const char ** seqs = bcf_hdr_seqnames(hdr, &ret);
313 if(ret == 0) printf(
"there is no contig id in the header!\n");
314 std::vector<std::string> vec;
315 for(
int i = 0; i < ret; i++)
317 vec.push_back(std::string(seqs[i]));
330 int tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag.c_str());
331 if(tag_id < 0)
return 0;
332 if(bcf_hdr_id2type(hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
336 else if(bcf_hdr_id2type(hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
340 else if(bcf_hdr_id2type(hdr, BCF_HL_FMT, tag_id) == (BCF_HT_STR & 0xff))
353 bcf_hdr_remove(hdr, BCF_HL_CTG, tag.c_str());
359 bcf_hdr_remove(hdr, BCF_HL_INFO, tag.c_str());
365 bcf_hdr_remove(hdr, BCF_HL_FMT, tag.c_str());
371 bcf_hdr_remove(hdr, BCF_HL_FLT, tag.c_str());
381 ret = bcf_hdr_set_samples(hdr, samples.c_str(), 0);
384 throw std::runtime_error(
"the " + std::to_string(ret)
385 +
"-th sample are not in the VCF.\nparameter samples:" + samples);
392 bcf_hdr_set_version(hdr, version.c_str());
398 return bcf_hdr_nsamples(hdr);
415 std::shared_ptr<bcf1_t> line = std::shared_ptr<bcf1_t>(bcf_init(),
variant_close());
416 bcf_hdr_t * hdr_d = NULL;
417 bcf_fmt_t * fmt = NULL;
418 bcf_info_t * info = NULL;
419 int32_t * gts = NULL;
420 int ndst, ret, nsamples;
421 bool noneMissing =
true;
422 bool isAllPhased =
false;
443 if(hdr_d) bcf_hdr_destroy(hdr_d);
450 if(!header->hdr)
throw std::runtime_error(
"please initial header first\n");
472 kstring_t s = {0, 0, NULL};
473 if(vcf_format(header->hdr, line.get(), &s) == 0)
475 std::string out(s.s, s.l);
481 throw std::runtime_error(
"couldn't format current record into a string.\n");
497 ret = bcf_get_genotypes(header->hdr, line.get(), >s, &ndst);
500 # if defined(VERBOSE)
501 std::cerr <<
"GT not present for current site. did you initilize the variant object?\n";
508 if(ret != nploidy * nsamples && nploidy > 0)
512 v.resize(nploidy * nsamples);
517 nploidy = ret / nsamples;
521 int i, j, nphased = 0;
523 fmt = bcf_get_fmt(header->hdr, line.get(),
"GT");
524 int nploidy_cur = ret / nsamples;
525 for(i = 0; i < nsamples; i++)
528 typeOfGT[i] = bcf_gt_type(fmt, i, NULL, NULL);
533 v[i * nploidy + 0] = 1;
534 for(j = 1; j < nploidy_cur; j++) v[i * nploidy + j] = 0;
538 for(j = 0; j < nploidy_cur; j++)
541 v[i * nploidy + j] = bcf_gt_allele(gts[j + i * nploidy_cur]) != 0;
545 gtPhase[i] = (gts[1 + i * nploidy_cur] & 1) == 1;
549 if(nphased == nsamples)
570 ret = bcf_get_genotypes(header->hdr, line.get(), >s, &ndst);
573 # if defined(VERBOSE)
574 std::cerr <<
"GT not present for current site. did you initilize the variant object?\n";
580 nploidy = ret / nsamples;
581 int i, j, nphased = 0;
583 for(i = 0; i < nsamples; i++)
585 int32_t * ptr = gts + i * nploidy;
587 for(j = 0; j < nploidy; j++)
590 if(ptr[j] == bcf_int32_vector_end)
break;
592 if(bcf_gt_is_missing(ptr[j]))
596 v[i * nploidy + j] = -9;
599 v[i * nploidy + j] = bcf_gt_allele(ptr[j]);
600 is_phased += bcf_gt_is_phased(ptr[j]);
602 if(nploidy == is_phased)
608 if(nphased == nsamples)
625 template<
typename T,
typename S =
typename T::value_type>
628 fmt = bcf_get_fmt(header->hdr, line.get(), tag.c_str());
631 throw std::invalid_argument(
"no FORMAT=" + tag +
" in the VCF header.\n");
639 ret = bcf_get_format_int32(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
643 ret = bcf_get_format_float(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
647 ret = bcf_get_format_char(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
657 v = std::vector<S>(dst, dst + ret);
674 bool getFORMAT(std::string tag, std::vector<std::string> & v)
676 fmt = bcf_get_fmt(header->hdr, line.get(), tag.c_str());
679 throw std::invalid_argument(
"no FORMAT=" + tag +
" in the VCF header.\n");
686 ret = bcf_get_format_string(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
690 for(
int i = 0; i < nsamples; i++)
693 v.emplace_back(dst[i]);
713 template<
typename T,
typename S =
typename T::value_type>
714 isInfoVector<T>
getINFO(std::string tag, T & v)
716 info = bcf_get_info(header->hdr, line.get(), tag.c_str());
719 throw std::invalid_argument(
"no INFO=" + tag +
" in the VCF header.\n");
724 if(info->type == BCF_BT_INT8 || info->type == BCF_BT_INT16 || info->type == BCF_BT_INT32)
726 ret = bcf_get_info_int32(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
728 else if(info->type == BCF_BT_FLOAT)
730 ret = bcf_get_info_float(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
734 v = std::vector<S>(dst, dst + ret);
754 info = bcf_get_info(header->hdr, line.get(), tag.c_str());
757 throw std::invalid_argument(
"no INFO=" + tag +
" in the VCF header.\n");
762 if(info->type == BCF_BT_INT8 || info->type == BCF_BT_INT16 || info->type == BCF_BT_INT32)
766 else if(info->type == BCF_BT_FLOAT)
774 # if defined(VERBOSE)
775 std::cerr <<
"there are multiple values for " + tag
776 +
" in INFO for current site. please use vector instead\n";
791 info = bcf_get_info(header->hdr, line.get(), tag.c_str());
794 throw std::invalid_argument(
"no INFO=" + tag +
" in the VCF header.\n");
796 if(info->type == BCF_BT_CHAR)
798 v = std::string(
reinterpret_cast<char *
>(info->vptr), info->vptr_len);
814 isScalar<T>
setINFO(std::string tag,
const T & v)
818 int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
819 if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff))
821 ret = bcf_update_info_int32(header->hdr, line.get(), tag.c_str(), &v, 1);
823 else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff))
825 float v2 =
static_cast<float>(v);
826 ret = bcf_update_info_float(header->hdr, line.get(), tag.c_str(), &v2, 1);
834 # if defined(VERBOSE)
835 std::cerr <<
"couldn't set " + tag +
" for this variant.\nplease add the tag in headerfirst.\n";
849 isValidInfo<T>
setINFO(std::string tag,
const T & v)
852 int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
853 if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff))
855 ret = bcf_update_info_int32(header->hdr, line.get(), tag.c_str(), v.data(), v.size());
857 else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff))
859 ret = bcf_update_info_float(header->hdr, line.get(), tag.c_str(), v.data(), v.size());
861 else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_STR & 0xff))
863 ret = bcf_update_info_string(header->hdr, line.get(), tag.c_str(), v.data());
872 # if defined(VERBOSE)
873 std::cerr <<
"couldn't set " + tag +
" for this variant.\nplease add the tag in headerfirst.\n";
883 int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
884 if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff))
886 ret = bcf_update_info_int32(header->hdr, line.get(), tag.c_str(), NULL, 0);
888 else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff))
890 ret = bcf_update_info_float(header->hdr, line.get(), tag.c_str(), NULL, 0);
892 else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_STR & 0xff))
894 ret = bcf_update_info_string(header->hdr, line.get(), tag.c_str(), NULL);
903 throw std::runtime_error(
"couldn't remove " + tag +
" for this variant.\n");
916 nploidy = v.size() / nsamples;
917 int32_t * gt = (int32_t *)malloc(v.size() *
sizeof(int32_t));
918 for(i = 0; i < nsamples; i++)
920 for(j = 0; j < nploidy; j++)
923 if(v[k] == -9 || v[k] == bcf_int32_missing)
925 gt[k] = bcf_gt_missing;
929 gt[k] = bcf_gt_phased(v[k]);
933 gt[k] = bcf_gt_unphased(v[k]);
937 if(bcf_update_genotypes(header->hdr, line.get(), gt, v.size()) < 0)
940 # if defined(VERBOSE)
941 std::cerr <<
"couldn't set genotypes correctly.\n";
955 assert((
int)v.size() == nsamples);
963 int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
964 if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
966 ret = bcf_update_format_int32(header->hdr, line.get(), tag.c_str(), NULL, 0);
968 else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_STR & 0xff))
970 ret = bcf_update_format_char(header->hdr, line.get(), tag.c_str(), NULL, 0);
972 else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
974 ret = bcf_update_format_float(header->hdr, line.get(), tag.c_str(), NULL, 0);
977 if(ret < 0)
throw std::runtime_error(
"couldn't remove " + tag +
" correctly.\n");
989 int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
990 if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
992 ret = bcf_update_format_int32(header->hdr, line.get(), tag.c_str(), v.data(), v.size());
994 else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_STR & 0xff))
996 ret = bcf_update_format_char(header->hdr, line.get(), tag.c_str(), v.data(), v.size());
998 else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
1000 ret = bcf_update_format_float(header->hdr, line.get(), tag.c_str(), v.data(), v.size());
1009 # if defined(VERBOSE)
1010 std::cerr <<
"couldn't set format " + tag +
" corectly.\n";
1024 template<
typename T>
1028 int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
1029 if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
1031 ret = bcf_update_format_int32(header->hdr, line.get(), tag.c_str(), &v, 1);
1033 else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
1035 ret = bcf_update_format_float(header->hdr, line.get(), tag.c_str(), &v2, 1);
1043 # if defined(VERBOSE)
1044 std::cerr <<
"couldn't set format " + tag +
" corectly.\n";
1054 kstring_t s = {0, 0, NULL};
1055 kputsn(vcfline.c_str(), vcfline.length(), &s);
1056 ret = vcf_parse1(&s, header->hdr, line.get());
1058 if(ret > 0)
throw std::runtime_error(
"error parsing: " + vcfline +
"\n");
1059 if(line->errcode == BCF_ERR_CTG_UNDEF)
1061 std::string contig(bcf_hdr_id2name(header->hdr, line->rid));
1062 hdr_d = bcf_hdr_dup(header->hdr);
1063 header->hrec = bcf_hdr_id2hrec(hdr_d, BCF_DT_CTG, 0, line->rid);
1064 if(header->hrec == NULL)
1065 throw std::runtime_error(
"contig" + contig +
" unknow and not found in the header.\n");
1066 ret = bcf_hdr_add_hrec(header->hdr, header->hrec);
1067 if(ret < 0)
throw std::runtime_error(
"error adding contig " + contig +
" to header.\n");
1068 ret = bcf_hdr_sync(header->hdr);
1081 if(bcf_get_info(header->hdr, line.get(),
"SVTYPE") == NULL)
1095 if(
REF().length() > 1 && !
isSV())
return true;
1096 for(
int i = 1; i < line->n_allele; i++)
1098 std::string alt(line->d.allele[i]);
1099 if(alt ==
".")
return true;
1100 if(alt.length() !=
REF().length() && !
isSV())
return true;
1108 if(line->n_allele <= 2)
return false;
1116 if(
REF().length() > 1 || line->n_allele <= 2)
return false;
1117 for(
int i = 1; i < line->n_allele; i++)
1119 std::string snp(line->d.allele[i]);
1120 if(snp.length() != 1)
1133 if(
REF().length() > 1 || line->n_allele > 2)
return false;
1134 std::string snp(line->d.allele[1]);
1135 if(!(snp ==
"A" || snp ==
"C" || snp ==
"G" || snp ==
"T"))
1146 int type = bcf_has_variant_types(line.get(), VCF_SNP, bcf_match_overlap);
1147 if(type < 0)
throw std::runtime_error(
"something wrong with variant type\n");
1148 if(type == 0)
return false;
1155 int type = bcf_has_variant_types(line.get(), VCF_INDEL, bcf_match_overlap);
1156 if(type < 0)
throw std::runtime_error(
"something wrong with variant type\n");
1157 if(type == 0)
return false;
1164 int type = bcf_has_variant_types(line.get(), VCF_INS, bcf_match_overlap);
1165 if(type < 0)
throw std::runtime_error(
"something wrong with variant type\n");
1166 if(type == 0)
return false;
1173 int type = bcf_has_variant_types(line.get(), VCF_DEL, bcf_match_overlap);
1174 if(type < 0)
throw std::runtime_error(
"something wrong with variant type\n");
1175 if(type == 0)
return false;
1182 int type = bcf_has_variant_types(line.get(), VCF_MNP, bcf_match_overlap);
1183 if(type < 0)
throw std::runtime_error(
"something wrong with variant type\n");
1184 if(type == 0)
return false;
1191 int type = bcf_has_variant_types(line.get(), VCF_BND, bcf_match_overlap);
1192 if(type < 0)
throw std::runtime_error(
"something wrong with variant type\n");
1193 if(type == 0)
return false;
1200 int type = bcf_has_variant_types(line.get(), VCF_OTHER, bcf_match_overlap);
1201 if(type < 0)
throw std::runtime_error(
"something wrong with variant type\n");
1202 if(type == 0)
return false;
1209 int type = bcf_has_variant_types(line.get(), VCF_OVERLAP, bcf_match_overlap);
1210 if(type < 0)
throw std::runtime_error(
"something wrong with variant type\n");
1211 if(type == 0)
return false;
1218 return std::string(bcf_hdr_id2name(header->hdr, line->rid));
1222 inline std::string
ID()
const
1224 return std::string(line->d.id);
1230 return line->pos + 1;
1236 line->rid = bcf_hdr_name2id(header->hdr, s.c_str());
1246 inline void setID(
const std::string & s)
1248 bcf_update_id(header->hdr, line.get(), s.c_str());
1254 bcf_update_alleles_str(header->hdr, line.get(), s.c_str());
1266 bcf_float_set_missing(line->qual);
1272 int32_t tmpi = bcf_hdr_id2int(header->hdr, BCF_DT_ID, s.c_str());
1273 bcf_add_filter(header->hdr, line.get(), tmpi);
1285 return line->pos + line->rlen;
1289 inline std::string
REF()
const
1291 return std::string(line->d.allele[0]);
1301 inline std::string
ALT()
const
1304 for(
int i = 1; i < line->n_allele; i++)
1306 s += std::string(line->d.allele[i]) +
",";
1308 if(s.length() > 1) s.pop_back();
1315 if(bcf_float_is_missing(line->qual))
1317 noneMissing =
false;
1318 return bcf_float_missing;
1329 if(line->d.n_flt == 0)
1333 else if(line->d.n_flt == 1)
1335 return std::string(bcf_hdr_int2id(header->hdr, BCF_DT_ID, line->d.flt[0]));
1340 for(
int i = 1; i < line->d.n_flt; i++)
1342 s += std::string(bcf_hdr_int2id(header->hdr, BCF_DT_ID, line->d.flt[i])) +
",";
1352 int32_t max_dt_id = header->hdr->n[BCF_DT_ID];
1353 kstring_t * s = (kstring_t *)calloc(1,
sizeof(kstring_t));
1357 for(
int i = 0; i < line->n_info; ++i)
1359 bcf_info_t * z = &line->d.info[i];
1360 if(!z->vptr)
continue;
1361 if(!first) kputc(
';', s);
1363 if(z->key < 0 || z->key >= max_dt_id || header->hdr->id[BCF_DT_ID][z->key].key == NULL)
1364 throw std::runtime_error(
"Invalid BCF and wrong INFO tag");
1365 kputs(header->hdr->id[BCF_DT_ID][z->key].key, s);
1366 if(z->len <= 0)
continue;
1373 if(z->v1.i == bcf_int8_missing)
1379 if(z->v1.i == bcf_int16_missing)
1385 if(z->v1.i == bcf_int32_missing)
1391 if(z->v1.i == bcf_int64_missing)
1397 if(bcf_float_is_missing(z->v1.f))
1406 throw std::runtime_error(
"Unexpected type in INFO");
1410 bcf_fmt_array(s, z->len, z->type, z->vptr);
1412 if(first) kputc(
'.', s);
1416 std::string out = std::string(s->s, s->l);
1464 std::shared_ptr<htsFile> fp;
1465 hts_idx_t * hidx = NULL;
1466 tbx_t * tidx = NULL;
1467 hts_itr_t * itr = NULL;
1468 kstring_t s = {0, 0, NULL};
1497 BcfReader(
const std::string & file,
const std::string & region) : fname(file)
1514 BcfReader(
const std::string & file,
const std::string & region,
const std::string & samples) : fname(file)
1530 if(itr) hts_itr_destroy(itr);
1531 if(hidx) hts_idx_destroy(hidx);
1532 if(tidx) tbx_destroy(tidx);
1537 void open(
const std::string & file)
1540 fp = std::shared_ptr<htsFile>(hts_open(file.c_str(),
"r"),
htsFile_close());
1541 if(!fp)
throw std::invalid_argument(
"I/O error: input file is invalid");
1542 header.hdr = bcf_hdr_read(fp.get());
1550 return hts_set_threads(fp.get(), n);
1589 if(isEndWith(fname,
"bcf") || isEndWith(fname,
"bcf.gz"))
1592 hidx = bcf_index_load(fname.c_str());
1593 if(itr) bcf_itr_destroy(itr);
1595 itr = bcf_itr_querys(hidx,
header.hdr,
".");
1597 itr = bcf_itr_querys(hidx,
header.hdr, region.c_str());
1602 tidx = tbx_index_load(fname.c_str());
1603 assert(tidx != NULL &&
"error loading tabix index!");
1604 if(itr) tbx_itr_destroy(itr);
1606 itr = tbx_itr_querys(tidx,
".");
1608 itr = tbx_itr_querys(tidx, region.c_str());
1609 assert(itr != NULL &&
"no interval region found.failed!");
1622 ret = bcf_itr_next(fp.get(), itr, r.line.get());
1623 bcf_unpack(r.line.get(), BCF_UN_ALL);
1628 int slen = tbx_itr_next(fp.get(), tidx, itr, &s);
1631 ret = vcf_parse1(&s, r.header->hdr, r.line.get());
1632 bcf_unpack(r.line.get(), BCF_UN_ALL);
1634 return (ret <= 0) && (slen > 0);
1639 ret = bcf_read(fp.get(), r.header->hdr, r.line.get());
1641 bcf_unpack(r.line.get(), BCF_UN_ALL);
1654 std::shared_ptr<htsFile> fp;
1655 std::shared_ptr<bcf1_t> b = std::shared_ptr<bcf1_t>(bcf_init(),
variant_close());
1657 bool isHeaderWritten =
false;
1672 BcfWriter(
const std::string & fname, std::string version =
"VCF4.1")
1700 BcfWriter(
const std::string & fname,
const std::string & version,
const std::string & mode)
1729 void open(
const std::string & fname)
1731 auto mode = getMode(fname,
"w");
1732 fp = std::shared_ptr<htsFile>(hts_open(fname.c_str(), mode.c_str()),
htsFile_close());
1733 if(!fp)
throw std::invalid_argument(
"I/O error: input file is invalid");
1745 void open(
const std::string & fname,
const std::string & mode)
1747 fp = std::shared_ptr<htsFile>(hts_open(fname.c_str(), mode.c_str()),
htsFile_close());
1748 if(!fp)
throw std::invalid_argument(
"I/O error: input file is invalid");
1762 header.hdr = bcf_hdr_init(
"w");
1773 void copyHeader(
const std::string & vcffile, std::string samples =
"-")
1775 htsFile * fp2 = hts_open(vcffile.c_str(),
"r");
1776 if(!fp2)
throw std::invalid_argument(
"I/O error: input file is invalid");
1779 bcf_hdr_t * hfull = bcf_hdr_read(fp2);
1780 header.hdr = bcf_hdr_subset(hfull, 0, 0, 0);
1781 bcf_hdr_remove(
header.hdr, BCF_HL_FMT, NULL);
1782 bcf_hdr_destroy(hfull);
1786 header.hdr = bcf_hdr_read(fp2);
1796 if(!isHeaderWritten && !
writeHeader())
throw std::runtime_error(
"could not write header\n");
1797 kstring_t s = {0, 0, NULL};
1798 kputsn(vcfline.c_str(), vcfline.length(), &s);
1799 ret = vcf_parse1(&s, hp->hdr, b.get());
1801 if(ret > 0)
throw std::runtime_error(
"error parsing: " + vcfline +
"\n");
1802 if(b->errcode == BCF_ERR_CTG_UNDEF)
1804 throw std::runtime_error(
"contig id " + (std::string)bcf_hdr_id2name(hp->hdr, b->rid)
1805 +
" not found in the header. please run header->AddContig() first.\n");
1807 ret = bcf_write(fp.get(), hp->hdr, b.get());
1808 if(ret != 0)
throw std::runtime_error(
"error writing: " + vcfline +
"\n");
1814 ret = bcf_hdr_write(fp.get(), hp->hdr);
1815 if(ret == 0)
return isHeaderWritten =
true;
1823 if(bcf_write(fp.get(), v.header->hdr, v.line.get()) < 0)
return false;
Stream in variants from compressed/uncompressed VCF/BCF file or stdin.
Definition: vcfpp.h:1462
void setSamples(const std::string &samples)
explicitly stream to specific samples
Definition: vcfpp.h:1573
void open(const std::string &file)
open a VCF/BCF/STDIN file for streaming in
Definition: vcfpp.h:1537
BcfReader(const std::string &file, const std::string ®ion, const std::string &samples)
construct a vcf/bcf reader with subset samples in target region
Definition: vcfpp.h:1514
BcfHeader header
a BcfHeader object
Definition: vcfpp.h:1474
int setThreads(int n)
set the number of threads to use
Definition: vcfpp.h:1548
uint64_t getVariantsCount(BcfRecord &r, const std::string ®ion)
get the number of records of given region
Definition: vcfpp.h:1560
BcfReader(const std::string &file, const std::string ®ion)
construct a vcf/bcf reader with subset samples
Definition: vcfpp.h:1497
bool getNextVariant(BcfRecord &r)
read in the next variant
Definition: vcfpp.h:1615
BcfReader()
Construct an empty BcfReader.
Definition: vcfpp.h:1481
int nsamples
number of samples in the VCF
Definition: vcfpp.h:1476
const BcfHeader & getHeader() const
return a BcfHeader object
Definition: vcfpp.h:1554
BcfReader(const std::string &file)
construct a vcf/bcf reader from file.
Definition: vcfpp.h:1487
void close()
close the BcfReader object.
Definition: vcfpp.h:1527
void setRegion(const std::string ®ion)
explicitly stream to specific region
Definition: vcfpp.h:1584
std::vector< std::string > SamplesName
number of samples in the VCF
Definition: vcfpp.h:1478
Object represents a variant record in the VCF, offering methods to access and modify fields.
Definition: vcfpp.h:409
float QUAL()
return QUAL value
Definition: vcfpp.h:1313
bool isIndel() const
return boolean value indicates if current variant is exclusively INDEL
Definition: vcfpp.h:1092
bool hasINDEL() const
return boolean value indicates if current variant has INDEL type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1153
void setRefAlt(const std::string &s)
set REF and ALT alleles given a string seperated by comma
Definition: vcfpp.h:1252
isValidGT< T > getGenotypes(T &v)
fill in the input vector with genotypes of 0 and 1. only works for ploidy<=2. Genotypes with missing ...
Definition: vcfpp.h:494
void setPOS(int64_t p)
modify position given 1-based value
Definition: vcfpp.h:1240
bool hasINS() const
return boolean value indicates if current variant has INS type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1162
BcfRecord()
empty constructor. call init() afterwards
Definition: vcfpp.h:432
std::string allINFO()
return raw INFO column as string. recommend to use getINFO for specific tag.
Definition: vcfpp.h:1350
int ploidy() const
return the number of ploidy of current variant
Definition: vcfpp.h:1429
void removeINFO(std::string tag)
remove the given tag from INFO of the variant
Definition: vcfpp.h:881
BcfRecord(BcfHeader &h)
constructor with a given BcfHeader object
Definition: vcfpp.h:435
std::string CHROM() const
return CHROM name
Definition: vcfpp.h:1216
void setID(const std::string &s)
update ID
Definition: vcfpp.h:1246
isValidFMT< T > setFORMAT(std::string tag, const T &v)
set tag values for all samples in FORMAT using given vector
Definition: vcfpp.h:987
int64_t POS() const
return 1-base position
Definition: vcfpp.h:1228
std::string ID() const
return ID field
Definition: vcfpp.h:1222
isScalar< T > setFORMAT(std::string tag, const T &v)
set tag for a single sample in FORMAT using given singular value. this works only when there is one s...
Definition: vcfpp.h:1025
bool isSV() const
return boolean value indicates if current variant is Structual Variant or not
Definition: vcfpp.h:1079
void setQUAL(float q)
modify the QUAL value
Definition: vcfpp.h:1258
bool hasSNP() const
return boolean value indicates if current variant has SNP type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1144
std::vector< char > isGenoMissing
if there is "." in GT for the sample, then it's coded as missing (TRUE)
Definition: vcfpp.h:428
std::string FILTER()
return raw FILTER column as string
Definition: vcfpp.h:1327
void swap_REF_ALT()
swap REF and ALT for biallelic SNP
Definition: vcfpp.h:1295
std::string REF() const
return raw REF alleles as string
Definition: vcfpp.h:1289
std::string asString() const
return current variant as raw string
Definition: vcfpp.h:470
int64_t Start() const
return 0-base start of the variant (can be any type)
Definition: vcfpp.h:1277
bool hasBND() const
return boolean value indicates if current variant has BND type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1189
friend std::ostream & operator<<(std::ostream &out, const BcfRecord &v)
stream out the variant
Definition: vcfpp.h:463
isScalar< T > setINFO(std::string tag, const T &v)
set tag value for INFO
Definition: vcfpp.h:814
void resetHeader(BcfHeader &h)
reset the header associated with BcfRecord object by pointing to another BcfHeader object
Definition: vcfpp.h:457
bool hasMNP() const
return boolean value indicates if current variant has MNP type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1180
bool allPhased() const
return boolean value indicates if genotypes of all samples are phased
Definition: vcfpp.h:1423
bool hasDEL() const
return boolean value indicates if current variant has DEL type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1171
bool isMultiAllelicSNP() const
return boolean value indicates if current variant is exclusively multiallelic SNP sites
Definition: vcfpp.h:1113
void setPhasing(const std::vector< char > &v)
set phasing status for all diploid samples using given vector
Definition: vcfpp.h:953
bool setGenotypes(const std::vector< int > &v)
set genotypes from scratch even if genotypes not present
Definition: vcfpp.h:912
std::string ALT() const
return raw ALT alleles as string
Definition: vcfpp.h:1301
bool hasOTHER() const
return boolean value indicates if current variant has OTHER type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1198
isString< T > getINFO(std::string tag, T &v)
get tag value in INFO
Definition: vcfpp.h:789
bool isSNP() const
return boolean value indicates if current variant is exclusively biallelic SNP. Note ALT=* are skippe...
Definition: vcfpp.h:1130
bool isMultiAllelics() const
return boolean value indicates if current variant is multiallelic sites
Definition: vcfpp.h:1106
void setCHR(const std::string &s)
modify CHROM value
Definition: vcfpp.h:1234
int64_t End() const
return 0-base end of the variant (can be any type)
Definition: vcfpp.h:1283
void setPloidy(int v)
in a rare case, one may want to set the number of ploidy manually
Definition: vcfpp.h:1435
isFormatVector< T > getFORMAT(std::string tag, T &v)
get tag value in FORMAT
Definition: vcfpp.h:626
void initHeader(BcfHeader &h)
initilize the header associated with BcfRecord object by pointing to another BcfHeader object
Definition: vcfpp.h:447
void addLineFromString(const std::string &vcfline)
add one variant record from given string
Definition: vcfpp.h:1052
bool getFORMAT(std::string tag, std::vector< std::string > &v)
get tag value in FORMAT
Definition: vcfpp.h:674
bool isNoneMissing() const
if all samples have non missing values for the tag in FORMAT
Definition: vcfpp.h:1073
void setQUAL(char q)
modify the QUAL value
Definition: vcfpp.h:1264
void setFILTER(const std::string &s)
modify the FILTER value
Definition: vcfpp.h:1270
isValidInfo< T > setINFO(std::string tag, const T &v)
set tag value for INFO
Definition: vcfpp.h:849
std::vector< char > gtPhase
vector of nsamples length. keep track of the phasing status of each sample
Definition: vcfpp.h:1454
void removeFORMAT(std::string tag)
remove the given tag from FORMAT of the variant
Definition: vcfpp.h:960
std::vector< char > typeOfGT
vector of nsamples length. keep track of the type of genotype (one of GT_HOM_RR, GT_HET_RA,...
Definition: vcfpp.h:1451
isInfoVector< T > getINFO(std::string tag, T &v)
get tag value in INFO
Definition: vcfpp.h:714
bool getGenotypes(std::vector< int > &v)
fill in the input vector with genotype values, 0, 1 or -9 (missing).
Definition: vcfpp.h:567
bool hasOVERLAP() const
return boolean value indicates if current variant has OVERLAP type defined in vcf....
Definition: vcfpp.h:1207
isScalar< T > getINFO(std::string tag, T &v)
get tag value in INFO
Definition: vcfpp.h:752
Stream out variants to compressed/uncompressed VCF/BCF file or stdout.
Definition: vcfpp.h:1652
void close()
close the BcfWriter object.
Definition: vcfpp.h:1752
void writeLine(const std::string &vcfline)
copy a string to a vcf line
Definition: vcfpp.h:1794
BcfWriter(const std::string &fname, const std::string &version, const std::string &mode)
Open VCF/BCF file for writing using given mode.
Definition: vcfpp.h:1700
void copyHeader(const std::string &vcffile, std::string samples="-")
copy header of given VCF and restrict on samples. if samples=="", FORMAT removed and only sites left
Definition: vcfpp.h:1773
BcfWriter(const std::string &fname, std::string version="VCF4.1")
Open VCF/BCF file for writing. The format is infered from file's suffix.
Definition: vcfpp.h:1672
void open(const std::string &fname)
Open VCF/BCF file for writing. The format is infered from file's suffix.
Definition: vcfpp.h:1729
BcfWriter(const std::string &fname, const BcfHeader &h)
Open VCF/BCF file for writing. The format is infered from file's suffix.
Definition: vcfpp.h:1684
void initalHeader(const BcfHeader &h)
initial a VCF header by pointing to header of another VCF
Definition: vcfpp.h:1767
BcfWriter(const std::string &fname, const BcfHeader &h, const std::string &mode)
Open VCF/BCF file for writing using given mode.
Definition: vcfpp.h:1717
void initalHeader(std::string version="VCF4.1")
initial a VCF header using the internal template given a specific version. VCF4.1 is the default
Definition: vcfpp.h:1760
BcfHeader header
header object initialized by initalHeader
Definition: vcfpp.h:1662
BcfWriter()
Construct an empty BcfWriter.
Definition: vcfpp.h:1665
void open(const std::string &fname, const std::string &mode)
Open VCF/BCF file for writing using given mode.
Definition: vcfpp.h:1745
bool writeRecord(BcfRecord &v)
streams out the given variant of BcfRecord type
Definition: vcfpp.h:1820
bool writeHeader()
streams out the header
Definition: vcfpp.h:1812