46 # include <type_traits>
52 # include <htslib/hts.h>
53 # include <htslib/kstring.h>
54 # include <htslib/tbx.h>
55 # include <htslib/vcf.h>
56 # include <htslib/vcfutils.h>
63 typename std::enable_if<std::is_same<T, std::string>::value || std::is_same<T, std::vector<char>>::value
64 || std::is_same<T, std::vector<int>>::value
65 || std::is_same<T, std::vector<float>>::value,
70 typename std::enable_if<std::is_same<T, std::string>::value || std::is_same<T, std::vector<int>>::value
71 || std::is_same<T, std::vector<float>>::value,
75 using isInfoVector =
typename std::enable_if<std::is_same<T, std::vector<int>>::value
76 || std::is_same<T, std::vector<float>>::value,
80 using isScalar =
typename std::enable_if<std::is_same<T, int>::value || std::is_same<T, float>::value
81 || std::is_same<T, double>::value,
85 using isString =
typename std::enable_if<std::is_same<T, std::string>::value,
bool>::type;
88 using isValidGT =
typename std::enable_if<std::is_same<T, std::vector<bool>>::value
89 || std::is_same<T, std::vector<char>>::value,
93 using isFormatVector =
typename std::enable_if<std::is_same<T, std::vector<float>>::value
94 || std::is_same<T, std::vector<char>>::value
95 || std::is_same<T, std::vector<int>>::value,
102 isScalar<T> isMissing(T
const & v)
104 if(std::is_same<T, float>::value)
106 return bcf_float_is_missing(v);
108 else if(std::is_same<T, int>::value)
110 return bcf_int32_missing(v);
115 inline bool isEndWith(std::string
const & s, std::string
const & e)
117 if(s.length() >= e.length())
119 return (0 == s.compare(s.length() - e.length(), e.length(), e));
128 inline std::string getMode(std::string
const & fname, std::string mode =
"r")
130 if(isEndWith(fname,
"bcf.gz")) mode +=
"b";
131 if(isEndWith(fname,
"bcf")) mode +=
"bu";
132 if(isEndWith(fname,
"vcf.gz")) mode +=
"z";
137 inline std::vector<std::string> split_string(
const std::string & s,
const std::string & separators)
139 std::vector<std::string> ret;
140 bool is_seperator[256] = {
false};
141 for(
auto & ch : separators)
143 is_seperator[(
unsigned int)ch] =
true;
146 for(
int i = 0; i <= (int)s.size(); i++)
148 if(is_seperator[(uint8_t)s[i]] || i == (int)s.size())
150 ret.emplace_back(s.begin() + begin, s.begin() + i);
160 void operator()(htsFile * x)
169 void operator()(hts_itr_t * x)
171 if(x) hts_itr_destroy(x);
178 void operator()(hts_idx_t * x)
180 if(x) hts_idx_destroy(x);
187 void operator()(tbx_t * x)
189 if(x) tbx_destroy(x);
196 void operator()(bcf1_t * x)
198 if(x) bcf_destroy(x);
205 void operator()(bcf_hdr_t * x)
207 if(x) bcf_hdr_destroy(x);
225 std::shared_ptr<bcf_hdr_t> hdr;
235 auto h = bcf_hdr_init(
"w");
238 bcf_hdr_set_version(hdr.get(), version.c_str());
269 const std::string & number,
270 const std::string & type,
271 const std::string & description)
273 addLine(
"##INFO=<ID=" +
id +
",Number=" + number +
",Type=" + type +
",Description=\"" + description
284 const std::string & number,
285 const std::string & type,
286 const std::string & description)
288 addLine(
"##FORMAT=<ID=" +
id +
",Number=" + number +
",Type=" + type +
",Description=\"" + description
297 inline void addFILTER(
const std::string &
id,
const std::string & description)
299 addLine(
"##FILTER=<ID=" +
id +
",Description=\"" + description +
"\">");
307 addLine(
"##contig=<ID=" +
id +
">");
316 ret = bcf_hdr_append(hdr.get(), str.c_str());
317 if(ret != 0)
throw std::runtime_error(
"could not add " + str +
" to header\n");
318 ret = bcf_hdr_sync(hdr.get());
319 if(ret != 0)
throw std::runtime_error(
"could not add " + str +
" to header\n");
325 bcf_hdr_add_sample(hdr.get(), sample.c_str());
326 if(bcf_hdr_sync(hdr.get()) != 0)
328 throw std::runtime_error(
"couldn't add the sample.\n");
335 kstring_t s = {0, 0,
nullptr};
336 if(bcf_hdr_format(hdr.get(), 0, &s) == 0)
338 std::string out(s.s, s.l);
344 throw std::runtime_error(
"failed to convert formatted header to string");
351 std::vector<std::string> vec;
352 for(
int i = 0; i < bcf_hdr_nsamples(hdr); i++)
354 vec.emplace_back(hdr->samples[i]);
363 const char ** seqs = bcf_hdr_seqnames(hdr.get(), &ret);
364 if(ret == 0) printf(
"there is no contig id in the header!\n");
365 std::vector<std::string> vec;
366 for(
int i = 0; i < ret; i++)
368 vec.emplace_back(seqs[i]);
381 int tag_id = bcf_hdr_id2int(hdr.get(), BCF_DT_ID, tag.c_str());
382 if(tag_id < 0)
return 0;
383 if(bcf_hdr_id2type(hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
387 else if(bcf_hdr_id2type(hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
391 else if(bcf_hdr_id2type(hdr, BCF_HL_FMT, tag_id) == (BCF_HT_STR & 0xff))
408 int tag_id = bcf_hdr_id2int(hdr.get(), BCF_DT_ID, tag.c_str());
409 if(!bcf_hdr_idinfo_exists(hdr, BCF_HL_INFO, tag_id))
return -1;
410 if(bcf_hdr_id2type(hdr, BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff))
414 else if(bcf_hdr_id2type(hdr, BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff))
418 else if(bcf_hdr_id2type(hdr, BCF_HL_INFO, tag_id) == (BCF_HT_STR & 0xff))
431 bcf_hdr_remove(hdr.get(), BCF_HL_CTG, tag.c_str());
437 bcf_hdr_remove(hdr.get(), BCF_HL_INFO, tag.c_str());
443 bcf_hdr_remove(hdr.get(), BCF_HL_FMT, tag.c_str());
449 bcf_hdr_remove(hdr.get(), BCF_HL_FLT, tag.c_str());
459 auto ss = details::split_string(samples,
",");
461 if(nsamples != (
int)ss.size())
462 throw std::runtime_error(
"You provide either too few or too many samples");
463 kstring_t htxt = {0, 0,
nullptr};
464 bcf_hdr_format(hdr.get(), 1, &htxt);
466 int i = htxt.l - 2, ncols = 0;
467 while(i >= 0 && htxt.s[i] !=
'\n')
469 if(htxt.s[i] ==
'\t') ncols++;
472 if(i < 0 || strncmp(htxt.s + i + 1,
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", 45))
474 if(i > 0 && !strncmp(htxt.s + i + 1,
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO", 38))
475 throw std::runtime_error(
"Error: missing FORMAT fields, cowardly refusing to add samples\n");
476 throw std::runtime_error(
"Could not parse the header: " + std::string(htxt.s));
483 if(htxt.s[i] ==
'\t') ncols++;
488 for(i = 0; i < nsamples; i++)
491 kputs(ss[i].c_str(), &htxt);
495 auto h = bcf_hdr_init(
"r");
496 if(bcf_hdr_parse(h, htxt.s) < 0)
497 throw std::runtime_error(
"An error occurred while parsing the header\n");
512 ret = bcf_hdr_set_samples(hdr.get(), samples.c_str(), 0);
515 throw std::runtime_error(
"the " + std::to_string(ret)
516 +
"-th sample are not in the VCF.\nparameter samples:" + samples);
523 bcf_hdr_set_version(hdr.get(), version.c_str());
529 return bcf_hdr_nsamples(hdr);
547 int32_t * gts =
nullptr;
548 int32_t ndst, ret, nsamples;
549 bool noneMissing =
true;
550 bool isAllPhased =
false;
577 if(!header->hdr)
throw std::runtime_error(
"please initial header first\n");
599 kstring_t s = {0, 0,
nullptr};
600 if(vcf_format(header->hdr.get(), line.get(), &s) == 0)
602 std::string out(s.s, s.l);
608 throw std::runtime_error(
"couldn't format current record into a string.\n");
624 ret = bcf_get_genotypes(header->hdr.get(), line.get(), >s, &ndst);
627 # if defined(VERBOSE)
628 std::cerr <<
"GT not present for current site. did you initilize the variant object?\n";
634 if(ret != nploidy * nsamples && nploidy > 0)
638 v.resize(nploidy * nsamples);
643 nploidy = ret / nsamples;
647 int i, j, nphased = 0;
649 bcf_fmt_t * fmt = bcf_get_fmt(header->hdr.get(), line.get(),
"GT");
650 int nploidy_cur = ret / nsamples;
651 for(i = 0; i < nsamples; i++)
654 typeOfGT[i] = bcf_gt_type(fmt, i,
nullptr,
nullptr);
659 v[i * nploidy + 0] = 1;
660 for(j = 1; j < nploidy_cur; j++) v[i * nploidy + j] = 0;
664 for(j = 0; j < nploidy_cur; j++)
667 v[i * nploidy + j] = bcf_gt_allele(gts[j + i * nploidy_cur]) != 0;
671 gtPhase[i] = (gts[1 + i * nploidy_cur] & 1) == 1;
675 if(nphased == nsamples)
696 ret = bcf_get_genotypes(header->hdr.get(), line.get(), >s, &ndst);
699 # if defined(VERBOSE)
700 std::cerr <<
"GT not present for current site. did you initilize the variant object?\n";
706 nploidy = ret / nsamples;
707 int i, j, nphased = 0;
709 for(i = 0; i < nsamples; i++)
711 int32_t * ptr = gts + i * nploidy;
713 for(j = 0; j < nploidy; j++)
716 if(ptr[j] == bcf_int32_vector_end)
break;
718 if(bcf_gt_is_missing(ptr[j]))
722 v[i * nploidy + j] = -9;
725 v[i * nploidy + j] = bcf_gt_allele(ptr[j]);
726 is_phased += bcf_gt_is_phased(ptr[j]);
728 if(nploidy == is_phased)
734 if(nphased == nsamples)
751 template<
typename T,
typename S =
typename T::value_type>
754 bcf_fmt_t * fmt = bcf_get_fmt(header->hdr.get(), line.get(), tag.c_str());
757 throw std::invalid_argument(
"no FORMAT=" + tag +
" in the VCF header.\n");
765 ret = bcf_get_format_int32(header->hdr.get(), line.get(), tag.c_str(), &dst, &ndst);
769 ret = bcf_get_format_float(header->hdr.get(), line.get(), tag.c_str(), &dst, &ndst);
773 ret = bcf_get_format_char(header->hdr.get(), line.get(), tag.c_str(), &dst, &ndst);
783 v = std::vector<S>(dst, dst + ret);
800 bool getFORMAT(std::string tag, std::vector<std::string> & v)
802 bcf_fmt_t * fmt = bcf_get_fmt(header->hdr.get(), line.get(), tag.c_str());
805 throw std::invalid_argument(
"no FORMAT=" + tag +
" in the VCF header.\n");
810 char ** dst =
nullptr;
812 ret = bcf_get_format_string(header->hdr.get(), line.get(), tag.c_str(), &dst, &ndst);
816 for(
int i = 0; i < nsamples; i++)
819 v.emplace_back(dst[i]);
839 template<
typename T,
typename S =
typename T::value_type>
840 isInfoVector<T>
getINFO(std::string tag, T & v)
842 bcf_info_t * info = bcf_get_info(header->hdr.get(), line.get(), tag.c_str());
845 throw std::invalid_argument(
"no INFO=" + tag +
" in the VCF header.\n");
850 if(info->type == BCF_BT_INT8 || info->type == BCF_BT_INT16 || info->type == BCF_BT_INT32)
852 ret = bcf_get_info_int32(header->hdr.get(), line.get(), tag.c_str(), &dst, &ndst);
854 else if(info->type == BCF_BT_FLOAT)
856 ret = bcf_get_info_float(header->hdr.get(), line.get(), tag.c_str(), &dst, &ndst);
860 v = std::vector<S>(dst, dst + ret);
880 bcf_info_t * info = bcf_get_info(header->hdr.get(), line.get(), tag.c_str());
883 throw std::invalid_argument(
"no INFO=" + tag +
" in the VCF header.\n");
888 if(info->type == BCF_BT_INT8 || info->type == BCF_BT_INT16 || info->type == BCF_BT_INT32)
892 else if(info->type == BCF_BT_FLOAT)
900 # if defined(VERBOSE)
901 std::cerr <<
"there are multiple values for " + tag
902 +
" in INFO for current site. please use vector instead\n";
917 bcf_info_t * info = bcf_get_info(header->hdr.get(), line.get(), tag.c_str());
920 throw std::invalid_argument(
"no INFO=" + tag +
" in the VCF header.\n");
922 if(info->type == BCF_BT_CHAR)
924 v = std::string(
reinterpret_cast<char *
>(info->vptr), info->vptr_len);
940 isScalar<T>
setINFO(std::string tag,
const T & v)
944 int tag_id = bcf_hdr_id2int(header->hdr.get(), BCF_DT_ID, tag.c_str());
945 if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff))
947 ret = bcf_update_info_int32(header->hdr.get(), line.get(), tag.c_str(), &v, 1);
949 else if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff))
951 float v2 =
static_cast<float>(v);
952 ret = bcf_update_info_float(header->hdr.get(), line.get(), tag.c_str(), &v2, 1);
960 # if defined(VERBOSE)
961 std::cerr <<
"couldn't set " + tag +
" for this variant.\nplease add the tag in headerfirst.\n";
975 isValidInfo<T>
setINFO(std::string tag,
const T & v)
978 int tag_id = bcf_hdr_id2int(header->hdr.get(), BCF_DT_ID, tag.c_str());
979 if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff))
981 ret = bcf_update_info_int32(header->hdr.get(), line.get(), tag.c_str(), v.data(), v.size());
983 else if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff))
985 ret = bcf_update_info_float(header->hdr.get(), line.get(), tag.c_str(), v.data(), v.size());
987 else if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_INFO, tag_id) == (BCF_HT_STR & 0xff))
989 ret = bcf_update_info_string(header->hdr.get(), line.get(), tag.c_str(), v.data());
998 # if defined(VERBOSE)
999 std::cerr <<
"couldn't set " + tag +
" for this variant.\nplease add the tag in headerfirst.\n";
1009 int tag_id = bcf_hdr_id2int(header->hdr.get(), BCF_DT_ID, tag.c_str());
1010 if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff))
1012 ret = bcf_update_info_int32(header->hdr.get(), line.get(), tag.c_str(), NULL, 0);
1014 else if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff))
1016 ret = bcf_update_info_float(header->hdr.get(), line.get(), tag.c_str(), NULL, 0);
1018 else if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_INFO, tag_id) == (BCF_HT_STR & 0xff))
1020 ret = bcf_update_info_string(header->hdr.get(), line.get(), tag.c_str(), NULL);
1029 throw std::runtime_error(
"couldn't remove " + tag +
" for this variant.\n");
1042 nploidy = v.size() / nsamples;
1043 int32_t * gt = (int32_t *)malloc(v.size() *
sizeof(int32_t));
1044 for(i = 0; i < nsamples; i++)
1046 for(j = 0; j < nploidy; j++)
1048 k = i * nploidy + j;
1049 if(v[k] == -9 || v[k] == bcf_int32_missing)
1051 gt[k] = bcf_gt_missing;
1055 gt[k] = bcf_gt_phased(v[k]);
1059 gt[k] = bcf_gt_unphased(v[k]);
1063 if(bcf_update_genotypes(header->hdr.get(), line.get(), gt, v.size()) < 0)
1066 # if defined(VERBOSE)
1067 std::cerr <<
"couldn't set genotypes correctly.\n";
1081 if((
int)v.size() != nsamples)
1082 throw std::runtime_error(
"the size of input vector is not matching the size of genotypes");
1090 int tag_id = bcf_hdr_id2int(header->hdr.get(), BCF_DT_ID, tag.c_str());
1091 if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
1093 ret = bcf_update_format_int32(header->hdr.get(), line.get(), tag.c_str(), NULL, 0);
1095 else if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_FMT, tag_id) == (BCF_HT_STR & 0xff))
1097 ret = bcf_update_format_char(header->hdr.get(), line.get(), tag.c_str(), NULL, 0);
1099 else if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
1101 ret = bcf_update_format_float(header->hdr.get(), line.get(), tag.c_str(), NULL, 0);
1104 if(ret < 0)
throw std::runtime_error(
"couldn't remove " + tag +
" correctly.\n");
1113 template<
typename T>
1116 int tag_id = bcf_hdr_id2int(header->hdr.get(), BCF_DT_ID, tag.c_str());
1117 if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
1119 ret = bcf_update_format_int32(header->hdr.get(), line.get(), tag.c_str(), v.data(), v.size());
1121 else if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_FMT, tag_id) == (BCF_HT_STR & 0xff))
1123 ret = bcf_update_format_char(header->hdr.get(), line.get(), tag.c_str(), v.data(), v.size());
1125 else if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
1127 ret = bcf_update_format_float(header->hdr.get(), line.get(), tag.c_str(), v.data(), v.size());
1136 # if defined(VERBOSE)
1137 std::cerr <<
"couldn't set format " + tag +
" corectly.\n";
1151 template<
typename T>
1155 int tag_id = bcf_hdr_id2int(header->hdr.get(), BCF_DT_ID, tag.c_str());
1156 if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
1158 ret = bcf_update_format_int32(header->hdr.get(), line.get(), tag.c_str(), &v, 1);
1160 else if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
1162 ret = bcf_update_format_float(header->hdr.get(), line.get(), tag.c_str(), &v2, 1);
1170 # if defined(VERBOSE)
1171 std::cerr <<
"couldn't set format " + tag +
" corectly.\n";
1181 kstring_t s = {0, 0,
nullptr};
1182 kputsn(vcfline.c_str(), vcfline.length(), &s);
1183 ret = vcf_parse1(&s, header->hdr.get(), line.get());
1185 if(ret > 0)
throw std::runtime_error(
"error parsing: " + vcfline +
"\n");
1186 if(line->errcode == BCF_ERR_CTG_UNDEF)
1188 std::string contig(bcf_hdr_id2name(header->hdr.get(), line->rid));
1189 auto hdr_d = bcf_hdr_dup(header->hdr.get());
1190 auto hrec = bcf_hdr_id2hrec(hdr_d, BCF_DT_CTG, 0, line->rid);
1191 bcf_hdr_destroy(hdr_d);
1193 throw std::runtime_error(
"contig" + contig +
" unknow and not found in the header.\n");
1194 ret = bcf_hdr_add_hrec(header->hdr.get(), hrec);
1195 bcf_hrec_destroy(hrec);
1196 if(ret < 0)
throw std::runtime_error(
"error adding contig " + contig +
" to header.\n");
1197 ret = bcf_hdr_sync(header->hdr.get());
1210 if(bcf_get_info(header->hdr.get(), line.get(),
"SVTYPE") ==
nullptr)
1224 if(
REF().length() > 1 && !
isSV())
return true;
1225 for(
int i = 1; i < line->n_allele; i++)
1227 std::string alt(line->d.allele[i]);
1228 if(alt ==
".")
return true;
1229 if(alt.length() !=
REF().length() && !
isSV())
return true;
1237 if(line->n_allele <= 2)
return false;
1245 if(
REF().length() > 1 || line->n_allele <= 2)
return false;
1246 for(
int i = 1; i < line->n_allele; i++)
1248 std::string snp(line->d.allele[i]);
1249 if(snp.length() != 1)
1262 if(
REF().length() > 1 || line->n_allele > 2)
return false;
1263 std::string snp(line->d.allele[1]);
1264 if(!(snp ==
"A" || snp ==
"C" || snp ==
"G" || snp ==
"T"))
1275 int type = bcf_has_variant_types(line.get(), VCF_SNP, bcf_match_overlap);
1276 if(type < 0)
throw std::runtime_error(
"something wrong with variant type\n");
1277 if(type == 0)
return false;
1284 int type = bcf_has_variant_types(line.get(), VCF_INDEL, bcf_match_overlap);
1285 if(type < 0)
throw std::runtime_error(
"something wrong with variant type\n");
1286 if(type == 0)
return false;
1293 int type = bcf_has_variant_types(line.get(), VCF_INS, bcf_match_overlap);
1294 if(type < 0)
throw std::runtime_error(
"something wrong with variant type\n");
1295 if(type == 0)
return false;
1302 int type = bcf_has_variant_types(line.get(), VCF_DEL, bcf_match_overlap);
1303 if(type < 0)
throw std::runtime_error(
"something wrong with variant type\n");
1304 if(type == 0)
return false;
1311 int type = bcf_has_variant_types(line.get(), VCF_MNP, bcf_match_overlap);
1312 if(type < 0)
throw std::runtime_error(
"something wrong with variant type\n");
1313 if(type == 0)
return false;
1320 int type = bcf_has_variant_types(line.get(), VCF_BND, bcf_match_overlap);
1321 if(type < 0)
throw std::runtime_error(
"something wrong with variant type\n");
1322 if(type == 0)
return false;
1329 int type = bcf_has_variant_types(line.get(), VCF_OTHER, bcf_match_overlap);
1330 if(type < 0)
throw std::runtime_error(
"something wrong with variant type\n");
1331 if(type == 0)
return false;
1338 int type = bcf_has_variant_types(line.get(), VCF_OVERLAP, bcf_match_overlap);
1339 if(type < 0)
throw std::runtime_error(
"something wrong with variant type\n");
1340 if(type == 0)
return false;
1347 return std::string(bcf_hdr_id2name(header->hdr.get(), line->rid));
1351 inline std::string
ID()
const
1353 return std::string(line->d.id);
1359 return line->pos + 1;
1365 line->rid = bcf_hdr_name2id(header->hdr.get(), s.c_str());
1375 inline void setID(
const std::string & s)
1377 bcf_update_id(header->hdr.get(), line.get(), s.c_str());
1383 bcf_update_alleles_str(header->hdr.get(), line.get(), s.c_str());
1395 bcf_float_set_missing(line->qual);
1401 int32_t tmpi = bcf_hdr_id2int(header->hdr.get(), BCF_DT_ID, s.c_str());
1402 bcf_add_filter(header->hdr.get(), line.get(), tmpi);
1414 return line->pos + line->rlen;
1418 inline std::string
REF()
const
1420 return std::string(line->d.allele[0]);
1430 inline std::string
ALT()
const
1433 for(
int i = 1; i < line->n_allele; i++)
1435 s += std::string(line->d.allele[i]) +
",";
1437 if(s.length() > 1) s.pop_back();
1444 if(bcf_float_is_missing(line->qual))
1446 noneMissing =
false;
1447 return bcf_float_missing;
1458 if(line->d.n_flt == 0)
1462 else if(line->d.n_flt == 1)
1464 return std::string(bcf_hdr_int2id(header->hdr.get(), BCF_DT_ID, line->d.flt[0]));
1469 for(
int i = 1; i < line->d.n_flt; i++)
1471 s += std::string(bcf_hdr_int2id(header->hdr.get(), BCF_DT_ID, line->d.flt[i])) +
",";
1481 int32_t max_dt_id = header->hdr->n[BCF_DT_ID];
1482 kstring_t * s = (kstring_t *)calloc(1,
sizeof(kstring_t));
1486 for(
int i = 0; i < line->n_info; ++i)
1488 bcf_info_t * z = &line->d.info[i];
1489 if(!z->vptr)
continue;
1490 if(!first) kputc(
';', s);
1492 if(z->key < 0 || z->key >= max_dt_id || header->hdr->id[BCF_DT_ID][z->key].key == NULL)
1493 throw std::runtime_error(
"Invalid BCF and wrong INFO tag");
1494 kputs(header->hdr->id[BCF_DT_ID][z->key].key, s);
1495 if(z->len <= 0)
continue;
1502 if(z->v1.i == bcf_int8_missing)
1508 if(z->v1.i == bcf_int16_missing)
1514 if(z->v1.i == bcf_int32_missing)
1520 if(z->v1.i == bcf_int64_missing)
1526 if(bcf_float_is_missing(z->v1.f))
1535 throw std::runtime_error(
"Unexpected type in INFO");
1539 bcf_fmt_array(s, z->len, z->type, z->vptr);
1541 if(first) kputc(
'.', s);
1545 std::string out = std::string(s->s, s->l);
1593 std::shared_ptr<htsFile> fp;
1594 std::shared_ptr<hts_idx_t> hidx;
1595 std::shared_ptr<tbx_t> tidx;
1596 std::shared_ptr<hts_itr_t> itr;
1628 BcfReader(
const std::string & file,
const std::string & region) : fname(file)
1631 if(file !=
"-" && !region.empty())
setRegion(region);
1645 BcfReader(
const std::string & file,
const std::string & region,
const std::string & samples) : fname(file)
1648 if(file !=
"-" && !region.empty())
setRegion(region);
1656 if(hidx) hidx.reset();
1657 if(itr) itr.reset();
1658 if(tidx) tidx.reset();
1662 void open(
const std::string & file)
1664 if(!fname.empty() && fname != file)
1665 throw std::runtime_error(
"does not support re-open a file yet. " + fname);
1668 if(!fp)
throw std::invalid_argument(
"I/O error: input file is invalid");
1669 enum htsExactFormat hts_format = hts_get_format(fp.get())->format;
1670 if(hts_format == bcf) isBcf =
true;
1671 auto h = bcf_hdr_read(fp.get());
1676 if(file ==
"-")
return;
1690 return hts_set_threads(fp.get(), n);
1709 catch(
const std::invalid_argument & e)
1713 catch(
const std::runtime_error & e)
1755 std::string::size_type n;
1757 if((n = region.find(
'-')) == std::string::npos)
1759 if((n = region.find(
':')) != std::string::npos)
1761 region +=
"-" + region.substr(n + 1, std::string::npos);
1769 if(itr) itr.reset();
1771 itr = std::shared_ptr<hts_itr_t>(bcf_itr_querys(hidx.get(),
header.hdr.get(),
"."),
1774 itr = std::shared_ptr<hts_itr_t>(bcf_itr_querys(hidx.get(),
header.hdr.get(), region.c_str()),
1779 if(tidx.get() ==
nullptr)
throw std::invalid_argument(
" no tabix index found!");
1780 if(itr) itr.reset();
1784 itr = std::shared_ptr<hts_itr_t>(tbx_itr_querys(tidx.get(), region.c_str()),
1787 if(itr.get() ==
nullptr)
1788 throw std::runtime_error(
"region was not found! make sure the region format is correct");
1796 if(itr.get() !=
nullptr)
1800 ret = bcf_itr_next(fp.get(), itr.get(), r.line.get());
1801 bcf_subset_format(r.header->hdr.get(), r.line.get());
1802 bcf_unpack(r.line.get(), BCF_UN_ALL);
1807 kstring_t s = {0, 0,
nullptr};
1808 int slen = tbx_itr_next(fp.get(), tidx.get(), itr.get(), &s);
1811 ret = vcf_parse1(&s, r.header->hdr.get(), r.line.get());
1812 bcf_unpack(r.line.get(), BCF_UN_ALL);
1815 return (ret <= 0) && (slen > 0);
1820 ret = bcf_read(fp.get(), r.header->hdr.get(), r.line.get());
1822 bcf_unpack(r.line.get(), BCF_UN_ALL);
1835 std::shared_ptr<htsFile> fp;
1838 bool isHeaderWritten =
false;
1856 BcfWriter(
const std::string & fname, std::string version =
"VCF4.1")
1883 BcfWriter(
const std::string & fname,
const std::string & version,
const std::string & mode)
1909 void open(
const std::string & fname)
1911 auto mode = details::getMode(fname,
"w");
1913 if(!fp)
throw std::invalid_argument(
"I/O error: input file is invalid");
1925 void open(
const std::string & fname,
const std::string & mode)
1928 if(!fp)
throw std::invalid_argument(
"I/O error: input file is invalid");
1948 if(!isHeaderWritten && !
writeHeader())
throw std::runtime_error(
"could not write header\n");
1949 kstring_t s = {0, 0,
nullptr};
1950 kputsn(vcfline.c_str(), vcfline.length(), &s);
1951 ret = vcf_parse1(&s,
header.hdr.get(), b.get());
1953 if(ret > 0)
throw std::runtime_error(
"error parsing: " + vcfline +
"\n");
1954 if(b->errcode == BCF_ERR_CTG_UNDEF)
1956 throw std::runtime_error(
"contig id " + (std::string)bcf_hdr_id2name(
header.hdr.get(), b->rid)
1957 +
" not found in the header. please run header->AddContig() first.\n");
1959 ret = bcf_write(fp.get(),
header.hdr.get(), b.get());
1960 if(ret != 0)
throw std::runtime_error(
"error writing: " + vcfline +
"\n");
1966 ret = bcf_hdr_write(fp.get(),
header.hdr.get());
1967 if(ret == 0)
return isHeaderWritten =
true;
1975 if(bcf_write(fp.get(), v.header->hdr.get(), v.line.get()) < 0)
return false;
Stream in variants from compressed/uncompressed VCF/BCF file or stdin.
Definition: vcfpp.h:1591
void setSamples(const std::string &samples)
explicitly stream to specific samples
Definition: vcfpp.h:1739
BcfReader()=default
Construct an empty BcfReader.
void open(const std::string &file)
Open a VCF/BCF/STDIN file for streaming in.
Definition: vcfpp.h:1662
BcfReader(const std::string &file, const std::string ®ion, const std::string &samples)
construct a vcf/bcf reader with subset samples in target region
Definition: vcfpp.h:1645
BcfHeader header
a BcfHeader object
Definition: vcfpp.h:1602
int setThreads(int n)
set the number of threads to use
Definition: vcfpp.h:1688
BcfReader(const std::string &file, const std::string ®ion)
construct a vcf/bcf reader with subset samples
Definition: vcfpp.h:1628
void setRegion(std::string region)
explicitly stream to specific region. throw invalid_argument error if index file not found....
Definition: vcfpp.h:1753
bool getNextVariant(BcfRecord &r)
read in the next variant
Definition: vcfpp.h:1793
int nsamples
number of samples in the VCF
Definition: vcfpp.h:1604
BcfReader(const std::string &file)
construct a vcf/bcf reader from file.
Definition: vcfpp.h:1618
void close()
Close the VCF file and its associated files.
Definition: vcfpp.h:1653
int getVariantsCount(const std::string ®ion)
count the number of variants by iterating through a given region.
Definition: vcfpp.h:1725
int getStatus(const std::string ®ion)
query the status of a given region in the VCF
Definition: vcfpp.h:1701
std::vector< std::string > SamplesName
a vector of samples name in the VCF
Definition: vcfpp.h:1606
Object represents a variant record in the VCF, offering methods to access and modify fields.
Definition: vcfpp.h:540
float QUAL()
return QUAL value
Definition: vcfpp.h:1442
bool isIndel() const
return boolean value indicates if current variant is exclusively INDEL
Definition: vcfpp.h:1221
bool hasINDEL() const
return boolean value indicates if current variant has INDEL type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1282
void setRefAlt(const std::string &s)
set REF and ALT alleles given a string seperated by comma
Definition: vcfpp.h:1381
isValidGT< T > getGenotypes(T &v)
fill in the input vector with genotypes of 0 and 1. only works for ploidy<=2. Genotypes with missing ...
Definition: vcfpp.h:621
void setPOS(int64_t p)
modify position given 1-based value
Definition: vcfpp.h:1369
bool hasINS() const
return boolean value indicates if current variant has INS type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1291
std::string allINFO()
return raw INFO column as string. recommend to use getINFO for specific tag.
Definition: vcfpp.h:1479
int ploidy() const
return the number of ploidy of current variant
Definition: vcfpp.h:1558
void removeINFO(std::string tag)
remove the given tag from INFO of the variant
Definition: vcfpp.h:1007
BcfRecord(BcfHeader &h)
constructor with a given BcfHeader object
Definition: vcfpp.h:563
std::string CHROM() const
return CHROM name
Definition: vcfpp.h:1345
void setID(const std::string &s)
update ID
Definition: vcfpp.h:1375
isValidFMT< T > setFORMAT(std::string tag, const T &v)
set tag values for all samples in FORMAT using given vector
Definition: vcfpp.h:1114
int64_t POS() const
return 1-base position
Definition: vcfpp.h:1357
std::string ID() const
return ID field
Definition: vcfpp.h:1351
isScalar< T > setFORMAT(std::string tag, const T &v)
set tag for a single sample in FORMAT using given singular value. this works only when there is one s...
Definition: vcfpp.h:1152
bool isSV() const
return boolean value indicates if current variant is Structual Variant or not
Definition: vcfpp.h:1208
void setQUAL(float q)
modify the QUAL value
Definition: vcfpp.h:1387
bool hasSNP() const
return boolean value indicates if current variant has SNP type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1273
std::vector< char > isGenoMissing
if there is "." in GT for the sample, then it's coded as missing (TRUE)
Definition: vcfpp.h:556
std::string FILTER()
return raw FILTER column as string
Definition: vcfpp.h:1456
void swap_REF_ALT()
swap REF and ALT for biallelic SNP
Definition: vcfpp.h:1424
std::string REF() const
return raw REF alleles as string
Definition: vcfpp.h:1418
std::string asString() const
return current variant as raw string
Definition: vcfpp.h:597
int64_t Start() const
return 0-base start of the variant (can be any type)
Definition: vcfpp.h:1406
bool hasBND() const
return boolean value indicates if current variant has BND type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1318
friend std::ostream & operator<<(std::ostream &out, const BcfRecord &v)
stream out the variant
Definition: vcfpp.h:590
isScalar< T > setINFO(std::string tag, const T &v)
set tag value for INFO
Definition: vcfpp.h:940
void resetHeader(BcfHeader &h)
reset the header associated with BcfRecord object by pointing to another BcfHeader object
Definition: vcfpp.h:584
bool hasMNP() const
return boolean value indicates if current variant has MNP type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1309
bool allPhased() const
return boolean value indicates if genotypes of all samples are phased
Definition: vcfpp.h:1552
bool hasDEL() const
return boolean value indicates if current variant has DEL type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1300
bool isMultiAllelicSNP() const
return boolean value indicates if current variant is exclusively multiallelic SNP sites
Definition: vcfpp.h:1242
void setPhasing(const std::vector< char > &v)
set phasing status for all diploid samples using given vector
Definition: vcfpp.h:1079
bool setGenotypes(const std::vector< int > &v)
set genotypes from scratch even if genotypes not present
Definition: vcfpp.h:1038
std::string ALT() const
return raw ALT alleles as string
Definition: vcfpp.h:1430
bool hasOTHER() const
return boolean value indicates if current variant has OTHER type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1327
isString< T > getINFO(std::string tag, T &v)
get tag value in INFO
Definition: vcfpp.h:915
bool isSNP() const
return boolean value indicates if current variant is exclusively biallelic SNP. Note ALT=* are skippe...
Definition: vcfpp.h:1259
bool isMultiAllelics() const
return boolean value indicates if current variant is multiallelic sites
Definition: vcfpp.h:1235
void setCHR(const std::string &s)
modify CHROM value
Definition: vcfpp.h:1363
int64_t End() const
return 0-base end of the variant (can be any type)
Definition: vcfpp.h:1412
void setPloidy(int v)
in a rare case, one may want to set the number of ploidy manually
Definition: vcfpp.h:1564
isFormatVector< T > getFORMAT(std::string tag, T &v)
get tag value in FORMAT
Definition: vcfpp.h:752
void initHeader(BcfHeader &h)
initilize the header associated with BcfRecord object by pointing to another BcfHeader object
Definition: vcfpp.h:574
void addLineFromString(const std::string &vcfline)
add one variant record from given string
Definition: vcfpp.h:1179
bool getFORMAT(std::string tag, std::vector< std::string > &v)
get tag value in FORMAT
Definition: vcfpp.h:800
bool isNoneMissing() const
if all samples have non missing values for the tag in FORMAT
Definition: vcfpp.h:1202
void setQUAL(char q)
modify the QUAL value
Definition: vcfpp.h:1393
void setFILTER(const std::string &s)
modify the FILTER value
Definition: vcfpp.h:1399
isValidInfo< T > setINFO(std::string tag, const T &v)
set tag value for INFO
Definition: vcfpp.h:975
BcfRecord()=default
empty constructor. call init() afterwards
std::vector< char > gtPhase
vector of nsamples length. keep track of the phasing status of each sample
Definition: vcfpp.h:1583
void removeFORMAT(std::string tag)
remove the given tag from FORMAT of the variant
Definition: vcfpp.h:1087
std::vector< char > typeOfGT
vector of nsamples length. keep track of the type of genotype (one of GT_HOM_RR, GT_HET_RA,...
Definition: vcfpp.h:1580
isInfoVector< T > getINFO(std::string tag, T &v)
get tag value in INFO
Definition: vcfpp.h:840
bool getGenotypes(std::vector< int > &v)
fill in the input vector with genotype values, 0, 1 or -9 (missing).
Definition: vcfpp.h:693
bool hasOVERLAP() const
return boolean value indicates if current variant has OVERLAP type defined in vcf....
Definition: vcfpp.h:1336
isScalar< T > getINFO(std::string tag, T &v)
get tag value in INFO
Definition: vcfpp.h:878
Stream out variants to compressed/uncompressed VCF/BCF file or stdout.
Definition: vcfpp.h:1833
void close()
close the BcfWriter object.
Definition: vcfpp.h:1932
void writeLine(const std::string &vcfline)
copy a string to a vcf line
Definition: vcfpp.h:1946
BcfWriter(const std::string &fname, const std::string &version, const std::string &mode)
Open VCF/BCF file for writing using given mode.
Definition: vcfpp.h:1883
BcfWriter(const std::string &fname, std::string version="VCF4.1")
Open VCF/BCF file for writing. The format is infered from file's suffix.
Definition: vcfpp.h:1856
void open(const std::string &fname)
Open VCF/BCF file for writing. The format is infered from file's suffix.
Definition: vcfpp.h:1909
BcfWriter(const std::string &fname, const BcfHeader &h)
Open VCF/BCF file for writing. The format is infered from file's suffix.
Definition: vcfpp.h:1867
void initalHeader(const BcfHeader &h)
initial a VCF header by pointing to header of another VCF
Definition: vcfpp.h:1940
BcfWriter(const std::string &fname, const BcfHeader &h, const std::string &mode)
Open VCF/BCF file for writing using given mode.
Definition: vcfpp.h:1899
BcfWriter()=default
Construct an empty BcfWriter.
BcfHeader header
header object initialized by initalHeader
Definition: vcfpp.h:1843
void open(const std::string &fname, const std::string &mode)
Open VCF/BCF file for writing using given mode.
Definition: vcfpp.h:1925
bool writeRecord(BcfRecord &v)
streams out the given variant of BcfRecord type
Definition: vcfpp.h:1972
bool writeHeader()
streams out the header
Definition: vcfpp.h:1964