46 # include <type_traits>
52 # include <htslib/hts.h>
53 # include <htslib/kstring.h>
54 # include <htslib/tbx.h>
55 # include <htslib/vcf.h>
56 # include <htslib/vcfutils.h>
63 typename std::enable_if<std::is_same<T, std::string>::value || std::is_same<T, std::vector<char>>::value
64 || std::is_same<T, std::vector<int>>::value
65 || std::is_same<T, std::vector<float>>::value,
70 typename std::enable_if<std::is_same<T, std::string>::value || std::is_same<T, std::vector<int>>::value
71 || std::is_same<T, std::vector<float>>::value,
75 using isInfoVector =
typename std::enable_if<std::is_same<T, std::vector<int>>::value
76 || std::is_same<T, std::vector<float>>::value,
80 using isScalar =
typename std::enable_if<std::is_same<T, int>::value || std::is_same<T, float>::value
81 || std::is_same<T, double>::value,
85 using isString =
typename std::enable_if<std::is_same<T, std::string>::value,
bool>::type;
88 using isValidGT =
typename std::enable_if<std::is_same<T, std::vector<bool>>::value
89 || std::is_same<T, std::vector<char>>::value,
93 using isFormatVector =
typename std::enable_if<std::is_same<T, std::vector<float>>::value
94 || std::is_same<T, std::vector<char>>::value
95 || std::is_same<T, std::vector<int>>::value,
102 isScalar<T> isMissing(T
const & v)
104 if(std::is_same<T, float>::value)
106 return bcf_float_is_missing(v);
108 else if(std::is_same<T, int>::value)
110 return bcf_int32_missing(v);
115 inline bool isEndWith(std::string
const & s, std::string
const & e)
117 if(s.length() >= e.length())
119 return (0 == s.compare(s.length() - e.length(), e.length(), e));
128 inline std::string getMode(std::string
const & fname, std::string mode =
"r")
130 if(isEndWith(fname,
"bcf.gz")) mode +=
"b";
131 if(isEndWith(fname,
"bcf")) mode +=
"bu";
132 if(isEndWith(fname,
"vcf.gz")) mode +=
"z";
137 inline std::vector<std::string> split_string(
const std::string & s,
const std::string & separators)
139 std::vector<std::string> ret;
140 bool is_seperator[256] = {
false};
141 for(
auto & ch : separators)
143 is_seperator[(
unsigned int)ch] =
true;
146 for(
int i = 0; i <= (int)s.size(); i++)
148 if(is_seperator[(uint8_t)s[i]] || i == (int)s.size())
150 ret.push_back(std::string(s.begin() + begin, s.begin() + i));
160 void operator()(htsFile * x)
169 void operator()(hts_itr_t * x)
171 if(x) hts_itr_destroy(x);
178 void operator()(hts_idx_t * x)
180 if(x) hts_idx_destroy(x);
187 void operator()(tbx_t * x)
189 if(x) tbx_destroy(x);
196 void operator()(bcf1_t * x)
198 if(x) bcf_destroy(x);
205 void operator()(bcf_hdr_t * x)
207 if(x) bcf_hdr_destroy(x);
225 bcf_hdr_t * hdr = NULL;
226 bcf_hrec_t * hrec = NULL;
233 if(hrec) bcf_hrec_destroy(hrec);
234 if(hdr) bcf_hdr_destroy(hdr);
253 const std::string & number,
254 const std::string & type,
255 const std::string & description)
257 addLine(
"##INFO=<ID=" +
id +
",Number=" + number +
",Type=" + type +
",Description=\"" + description
268 const std::string & number,
269 const std::string & type,
270 const std::string & description)
272 addLine(
"##FORMAT=<ID=" +
id +
",Number=" + number +
",Type=" + type +
",Description=\"" + description
281 inline void addFILTER(
const std::string &
id,
const std::string & description)
283 addLine(
"##FILTER=<ID=" +
id +
",Description=\"" + description +
"\">");
291 addLine(
"##contig=<ID=" +
id +
">");
300 ret = bcf_hdr_append(hdr, str.c_str());
301 if(ret != 0)
throw std::runtime_error(
"could not add " + str +
" to header\n");
302 ret = bcf_hdr_sync(hdr);
303 if(ret != 0)
throw std::runtime_error(
"could not add " + str +
" to header\n");
309 bcf_hdr_add_sample(hdr, sample.c_str());
310 if(bcf_hdr_sync(hdr) != 0)
312 throw std::runtime_error(
"couldn't add the sample.\n");
319 kstring_t s = {0, 0, NULL};
320 if(bcf_hdr_format(hdr, 0, &s) == 0)
322 std::string out(s.s, s.l);
328 throw std::runtime_error(
"failed to convert formatted header to string");
335 std::vector<std::string> vec;
336 for(
int i = 0; i < bcf_hdr_nsamples(hdr); i++)
338 vec.push_back(std::string(hdr->samples[i]));
347 const char ** seqs = bcf_hdr_seqnames(hdr, &ret);
348 if(ret == 0) printf(
"there is no contig id in the header!\n");
349 std::vector<std::string> vec;
350 for(
int i = 0; i < ret; i++)
352 vec.push_back(std::string(seqs[i]));
365 int tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag.c_str());
366 if(tag_id < 0)
return 0;
367 if(bcf_hdr_id2type(hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
371 else if(bcf_hdr_id2type(hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
375 else if(bcf_hdr_id2type(hdr, BCF_HL_FMT, tag_id) == (BCF_HT_STR & 0xff))
388 bcf_hdr_remove(hdr, BCF_HL_CTG, tag.c_str());
394 bcf_hdr_remove(hdr, BCF_HL_INFO, tag.c_str());
400 bcf_hdr_remove(hdr, BCF_HL_FMT, tag.c_str());
406 bcf_hdr_remove(hdr, BCF_HL_FLT, tag.c_str());
416 auto ss = details::split_string(samples,
",");
418 if(nsamples != (
int)ss.size())
419 throw std::runtime_error(
"You provide either too few or too many samples");
420 kstring_t htxt = {0, 0, 0};
421 bcf_hdr_format(hdr, 1, &htxt);
423 int i = htxt.l - 2, ncols = 0;
424 while(i >= 0 && htxt.s[i] !=
'\n')
426 if(htxt.s[i] ==
'\t') ncols++;
429 if(i < 0 || strncmp(htxt.s + i + 1,
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", 45))
431 if(i > 0 && !strncmp(htxt.s + i + 1,
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO", 38))
432 throw std::runtime_error(
"Error: missing FORMAT fields, cowardly refusing to add samples\n");
433 throw std::runtime_error(
"Could not parse the header: " + std::string(htxt.s));
440 if(htxt.s[i] ==
'\t') ncols++;
445 for(i = 0; i < nsamples; i++)
448 kputs(ss[i].c_str(), &htxt);
453 bcf_hdr_destroy(hdr);
454 hdr = bcf_hdr_init(
"r");
455 if(bcf_hdr_parse(hdr, htxt.s) < 0)
456 throw std::runtime_error(
"An error occurred while parsing the header\n");
468 ret = bcf_hdr_set_samples(hdr, samples.c_str(), 0);
471 throw std::runtime_error(
"the " + std::to_string(ret)
472 +
"-th sample are not in the VCF.\nparameter samples:" + samples);
479 bcf_hdr_set_version(hdr, version.c_str());
485 return bcf_hdr_nsamples(hdr);
503 bcf_hdr_t * hdr_d = NULL;
504 bcf_fmt_t * fmt = NULL;
505 bcf_info_t * info = NULL;
506 int32_t * gts = NULL;
507 int ndst, ret, nsamples;
508 bool noneMissing =
true;
509 bool isAllPhased =
false;
530 if(hdr_d) bcf_hdr_destroy(hdr_d);
537 if(!header->hdr)
throw std::runtime_error(
"please initial header first\n");
559 kstring_t s = {0, 0, NULL};
560 if(vcf_format(header->hdr, line.get(), &s) == 0)
562 std::string out(s.s, s.l);
568 throw std::runtime_error(
"couldn't format current record into a string.\n");
584 ret = bcf_get_genotypes(header->hdr, line.get(), >s, &ndst);
587 # if defined(VERBOSE)
588 std::cerr <<
"GT not present for current site. did you initilize the variant object?\n";
595 if(ret != nploidy * nsamples && nploidy > 0)
599 v.resize(nploidy * nsamples);
604 nploidy = ret / nsamples;
608 int i, j, nphased = 0;
610 fmt = bcf_get_fmt(header->hdr, line.get(),
"GT");
611 int nploidy_cur = ret / nsamples;
612 for(i = 0; i < nsamples; i++)
615 typeOfGT[i] = bcf_gt_type(fmt, i, NULL, NULL);
620 v[i * nploidy + 0] = 1;
621 for(j = 1; j < nploidy_cur; j++) v[i * nploidy + j] = 0;
625 for(j = 0; j < nploidy_cur; j++)
628 v[i * nploidy + j] = bcf_gt_allele(gts[j + i * nploidy_cur]) != 0;
632 gtPhase[i] = (gts[1 + i * nploidy_cur] & 1) == 1;
636 if(nphased == nsamples)
657 ret = bcf_get_genotypes(header->hdr, line.get(), >s, &ndst);
660 # if defined(VERBOSE)
661 std::cerr <<
"GT not present for current site. did you initilize the variant object?\n";
667 nploidy = ret / nsamples;
668 int i, j, nphased = 0;
670 for(i = 0; i < nsamples; i++)
672 int32_t * ptr = gts + i * nploidy;
674 for(j = 0; j < nploidy; j++)
677 if(ptr[j] == bcf_int32_vector_end)
break;
679 if(bcf_gt_is_missing(ptr[j]))
683 v[i * nploidy + j] = -9;
686 v[i * nploidy + j] = bcf_gt_allele(ptr[j]);
687 is_phased += bcf_gt_is_phased(ptr[j]);
689 if(nploidy == is_phased)
695 if(nphased == nsamples)
712 template<
typename T,
typename S =
typename T::value_type>
715 fmt = bcf_get_fmt(header->hdr, line.get(), tag.c_str());
718 throw std::invalid_argument(
"no FORMAT=" + tag +
" in the VCF header.\n");
726 ret = bcf_get_format_int32(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
730 ret = bcf_get_format_float(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
734 ret = bcf_get_format_char(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
744 v = std::vector<S>(dst, dst + ret);
761 bool getFORMAT(std::string tag, std::vector<std::string> & v)
763 fmt = bcf_get_fmt(header->hdr, line.get(), tag.c_str());
766 throw std::invalid_argument(
"no FORMAT=" + tag +
" in the VCF header.\n");
773 ret = bcf_get_format_string(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
777 for(
int i = 0; i < nsamples; i++)
780 v.emplace_back(dst[i]);
800 template<
typename T,
typename S =
typename T::value_type>
801 isInfoVector<T>
getINFO(std::string tag, T & v)
803 info = bcf_get_info(header->hdr, line.get(), tag.c_str());
806 throw std::invalid_argument(
"no INFO=" + tag +
" in the VCF header.\n");
811 if(info->type == BCF_BT_INT8 || info->type == BCF_BT_INT16 || info->type == BCF_BT_INT32)
813 ret = bcf_get_info_int32(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
815 else if(info->type == BCF_BT_FLOAT)
817 ret = bcf_get_info_float(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
821 v = std::vector<S>(dst, dst + ret);
841 info = bcf_get_info(header->hdr, line.get(), tag.c_str());
844 throw std::invalid_argument(
"no INFO=" + tag +
" in the VCF header.\n");
849 if(info->type == BCF_BT_INT8 || info->type == BCF_BT_INT16 || info->type == BCF_BT_INT32)
853 else if(info->type == BCF_BT_FLOAT)
861 # if defined(VERBOSE)
862 std::cerr <<
"there are multiple values for " + tag
863 +
" in INFO for current site. please use vector instead\n";
878 info = bcf_get_info(header->hdr, line.get(), tag.c_str());
881 throw std::invalid_argument(
"no INFO=" + tag +
" in the VCF header.\n");
883 if(info->type == BCF_BT_CHAR)
885 v = std::string(
reinterpret_cast<char *
>(info->vptr), info->vptr_len);
901 isScalar<T>
setINFO(std::string tag,
const T & v)
905 int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
906 if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff))
908 ret = bcf_update_info_int32(header->hdr, line.get(), tag.c_str(), &v, 1);
910 else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff))
912 float v2 =
static_cast<float>(v);
913 ret = bcf_update_info_float(header->hdr, line.get(), tag.c_str(), &v2, 1);
921 # if defined(VERBOSE)
922 std::cerr <<
"couldn't set " + tag +
" for this variant.\nplease add the tag in headerfirst.\n";
936 isValidInfo<T>
setINFO(std::string tag,
const T & v)
939 int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
940 if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff))
942 ret = bcf_update_info_int32(header->hdr, line.get(), tag.c_str(), v.data(), v.size());
944 else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff))
946 ret = bcf_update_info_float(header->hdr, line.get(), tag.c_str(), v.data(), v.size());
948 else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_STR & 0xff))
950 ret = bcf_update_info_string(header->hdr, line.get(), tag.c_str(), v.data());
959 # if defined(VERBOSE)
960 std::cerr <<
"couldn't set " + tag +
" for this variant.\nplease add the tag in headerfirst.\n";
970 int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
971 if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff))
973 ret = bcf_update_info_int32(header->hdr, line.get(), tag.c_str(), NULL, 0);
975 else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff))
977 ret = bcf_update_info_float(header->hdr, line.get(), tag.c_str(), NULL, 0);
979 else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_STR & 0xff))
981 ret = bcf_update_info_string(header->hdr, line.get(), tag.c_str(), NULL);
990 throw std::runtime_error(
"couldn't remove " + tag +
" for this variant.\n");
1003 nploidy = v.size() / nsamples;
1004 int32_t * gt = (int32_t *)malloc(v.size() *
sizeof(int32_t));
1005 for(i = 0; i < nsamples; i++)
1007 for(j = 0; j < nploidy; j++)
1009 k = i * nploidy + j;
1010 if(v[k] == -9 || v[k] == bcf_int32_missing)
1012 gt[k] = bcf_gt_missing;
1016 gt[k] = bcf_gt_phased(v[k]);
1020 gt[k] = bcf_gt_unphased(v[k]);
1024 if(bcf_update_genotypes(header->hdr, line.get(), gt, v.size()) < 0)
1027 # if defined(VERBOSE)
1028 std::cerr <<
"couldn't set genotypes correctly.\n";
1042 if((
int)v.size() != nsamples)
1043 throw std::runtime_error(
"the size of input vector is not matching the size of genotypes");
1051 int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
1052 if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
1054 ret = bcf_update_format_int32(header->hdr, line.get(), tag.c_str(), NULL, 0);
1056 else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_STR & 0xff))
1058 ret = bcf_update_format_char(header->hdr, line.get(), tag.c_str(), NULL, 0);
1060 else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
1062 ret = bcf_update_format_float(header->hdr, line.get(), tag.c_str(), NULL, 0);
1065 if(ret < 0)
throw std::runtime_error(
"couldn't remove " + tag +
" correctly.\n");
1074 template<
typename T>
1077 int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
1078 if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
1080 ret = bcf_update_format_int32(header->hdr, line.get(), tag.c_str(), v.data(), v.size());
1082 else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_STR & 0xff))
1084 ret = bcf_update_format_char(header->hdr, line.get(), tag.c_str(), v.data(), v.size());
1086 else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
1088 ret = bcf_update_format_float(header->hdr, line.get(), tag.c_str(), v.data(), v.size());
1097 # if defined(VERBOSE)
1098 std::cerr <<
"couldn't set format " + tag +
" corectly.\n";
1112 template<
typename T>
1116 int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
1117 if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
1119 ret = bcf_update_format_int32(header->hdr, line.get(), tag.c_str(), &v, 1);
1121 else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
1123 ret = bcf_update_format_float(header->hdr, line.get(), tag.c_str(), &v2, 1);
1131 # if defined(VERBOSE)
1132 std::cerr <<
"couldn't set format " + tag +
" corectly.\n";
1142 kstring_t s = {0, 0, NULL};
1143 kputsn(vcfline.c_str(), vcfline.length(), &s);
1144 ret = vcf_parse1(&s, header->hdr, line.get());
1146 if(ret > 0)
throw std::runtime_error(
"error parsing: " + vcfline +
"\n");
1147 if(line->errcode == BCF_ERR_CTG_UNDEF)
1149 std::string contig(bcf_hdr_id2name(header->hdr, line->rid));
1150 hdr_d = bcf_hdr_dup(header->hdr);
1151 header->hrec = bcf_hdr_id2hrec(hdr_d, BCF_DT_CTG, 0, line->rid);
1152 if(header->hrec == NULL)
1153 throw std::runtime_error(
"contig" + contig +
" unknow and not found in the header.\n");
1154 ret = bcf_hdr_add_hrec(header->hdr, header->hrec);
1155 if(ret < 0)
throw std::runtime_error(
"error adding contig " + contig +
" to header.\n");
1156 ret = bcf_hdr_sync(header->hdr);
1169 if(bcf_get_info(header->hdr, line.get(),
"SVTYPE") == NULL)
1183 if(
REF().length() > 1 && !
isSV())
return true;
1184 for(
int i = 1; i < line->n_allele; i++)
1186 std::string alt(line->d.allele[i]);
1187 if(alt ==
".")
return true;
1188 if(alt.length() !=
REF().length() && !
isSV())
return true;
1196 if(line->n_allele <= 2)
return false;
1204 if(
REF().length() > 1 || line->n_allele <= 2)
return false;
1205 for(
int i = 1; i < line->n_allele; i++)
1207 std::string snp(line->d.allele[i]);
1208 if(snp.length() != 1)
1221 if(
REF().length() > 1 || line->n_allele > 2)
return false;
1222 std::string snp(line->d.allele[1]);
1223 if(!(snp ==
"A" || snp ==
"C" || snp ==
"G" || snp ==
"T"))
1234 int type = bcf_has_variant_types(line.get(), VCF_SNP, bcf_match_overlap);
1235 if(type < 0)
throw std::runtime_error(
"something wrong with variant type\n");
1236 if(type == 0)
return false;
1243 int type = bcf_has_variant_types(line.get(), VCF_INDEL, bcf_match_overlap);
1244 if(type < 0)
throw std::runtime_error(
"something wrong with variant type\n");
1245 if(type == 0)
return false;
1252 int type = bcf_has_variant_types(line.get(), VCF_INS, bcf_match_overlap);
1253 if(type < 0)
throw std::runtime_error(
"something wrong with variant type\n");
1254 if(type == 0)
return false;
1261 int type = bcf_has_variant_types(line.get(), VCF_DEL, bcf_match_overlap);
1262 if(type < 0)
throw std::runtime_error(
"something wrong with variant type\n");
1263 if(type == 0)
return false;
1270 int type = bcf_has_variant_types(line.get(), VCF_MNP, bcf_match_overlap);
1271 if(type < 0)
throw std::runtime_error(
"something wrong with variant type\n");
1272 if(type == 0)
return false;
1279 int type = bcf_has_variant_types(line.get(), VCF_BND, bcf_match_overlap);
1280 if(type < 0)
throw std::runtime_error(
"something wrong with variant type\n");
1281 if(type == 0)
return false;
1288 int type = bcf_has_variant_types(line.get(), VCF_OTHER, bcf_match_overlap);
1289 if(type < 0)
throw std::runtime_error(
"something wrong with variant type\n");
1290 if(type == 0)
return false;
1297 int type = bcf_has_variant_types(line.get(), VCF_OVERLAP, bcf_match_overlap);
1298 if(type < 0)
throw std::runtime_error(
"something wrong with variant type\n");
1299 if(type == 0)
return false;
1306 return std::string(bcf_hdr_id2name(header->hdr, line->rid));
1310 inline std::string
ID()
const
1312 return std::string(line->d.id);
1318 return line->pos + 1;
1324 line->rid = bcf_hdr_name2id(header->hdr, s.c_str());
1334 inline void setID(
const std::string & s)
1336 bcf_update_id(header->hdr, line.get(), s.c_str());
1342 bcf_update_alleles_str(header->hdr, line.get(), s.c_str());
1354 bcf_float_set_missing(line->qual);
1360 int32_t tmpi = bcf_hdr_id2int(header->hdr, BCF_DT_ID, s.c_str());
1361 bcf_add_filter(header->hdr, line.get(), tmpi);
1373 return line->pos + line->rlen;
1377 inline std::string
REF()
const
1379 return std::string(line->d.allele[0]);
1389 inline std::string
ALT()
const
1392 for(
int i = 1; i < line->n_allele; i++)
1394 s += std::string(line->d.allele[i]) +
",";
1396 if(s.length() > 1) s.pop_back();
1403 if(bcf_float_is_missing(line->qual))
1405 noneMissing =
false;
1406 return bcf_float_missing;
1417 if(line->d.n_flt == 0)
1421 else if(line->d.n_flt == 1)
1423 return std::string(bcf_hdr_int2id(header->hdr, BCF_DT_ID, line->d.flt[0]));
1428 for(
int i = 1; i < line->d.n_flt; i++)
1430 s += std::string(bcf_hdr_int2id(header->hdr, BCF_DT_ID, line->d.flt[i])) +
",";
1440 int32_t max_dt_id = header->hdr->n[BCF_DT_ID];
1441 kstring_t * s = (kstring_t *)calloc(1,
sizeof(kstring_t));
1445 for(
int i = 0; i < line->n_info; ++i)
1447 bcf_info_t * z = &line->d.info[i];
1448 if(!z->vptr)
continue;
1449 if(!first) kputc(
';', s);
1451 if(z->key < 0 || z->key >= max_dt_id || header->hdr->id[BCF_DT_ID][z->key].key == NULL)
1452 throw std::runtime_error(
"Invalid BCF and wrong INFO tag");
1453 kputs(header->hdr->id[BCF_DT_ID][z->key].key, s);
1454 if(z->len <= 0)
continue;
1461 if(z->v1.i == bcf_int8_missing)
1467 if(z->v1.i == bcf_int16_missing)
1473 if(z->v1.i == bcf_int32_missing)
1479 if(z->v1.i == bcf_int64_missing)
1485 if(bcf_float_is_missing(z->v1.f))
1494 throw std::runtime_error(
"Unexpected type in INFO");
1498 bcf_fmt_array(s, z->len, z->type, z->vptr);
1500 if(first) kputc(
'.', s);
1504 std::string out = std::string(s->s, s->l);
1552 std::shared_ptr<htsFile> fp;
1553 std::shared_ptr<hts_idx_t> hidx;
1554 std::shared_ptr<tbx_t> tidx;
1555 std::shared_ptr<hts_itr_t> itr;
1556 kstring_t s = {0, 0, NULL};
1585 BcfReader(
const std::string & file,
const std::string & region) : fname(file)
1602 BcfReader(
const std::string & file,
const std::string & region,
const std::string & samples) : fname(file)
1618 if(hidx) hidx.reset();
1619 if(itr) itr.reset();
1620 if(tidx) tidx.reset();
1624 void open(
const std::string & file)
1626 if(!fname.empty() && fname != file)
1627 throw std::runtime_error(
"does not support re-open a file yet. " + fname);
1630 if(!fp)
throw std::invalid_argument(
"I/O error: input file is invalid");
1631 header.hdr = bcf_hdr_read(fp.get());
1639 return hts_set_threads(fp.get(), n);
1664 catch(
const std::invalid_argument & e)
1668 catch(
const std::runtime_error & e)
1713 if(details::isEndWith(fname,
"bcf") || details::isEndWith(fname,
"bcf.gz"))
1717 if(itr) itr.reset();
1719 itr = std::shared_ptr<hts_itr_t>(bcf_itr_querys(hidx.get(),
header.hdr,
"."),
1722 itr = std::shared_ptr<hts_itr_t>(bcf_itr_querys(hidx.get(),
header.hdr, region.c_str()),
1729 if(tidx.get() == NULL)
throw std::invalid_argument(
" no tabix index found!");
1730 if(itr) itr.reset();
1734 itr = std::shared_ptr<hts_itr_t>(tbx_itr_querys(tidx.get(), region.c_str()),
1737 if(itr.get() == NULL)
1738 throw std::runtime_error(
"region was not found! make sure the region format is correct");
1746 if(itr.get() != NULL)
1750 ret = bcf_itr_next(fp.get(), itr.get(), r.line.get());
1751 bcf_unpack(r.line.get(), BCF_UN_ALL);
1756 int slen = tbx_itr_next(fp.get(), tidx.get(), itr.get(), &s);
1759 ret = vcf_parse1(&s, r.header->hdr, r.line.get());
1760 bcf_unpack(r.line.get(), BCF_UN_ALL);
1762 return (ret <= 0) && (slen > 0);
1767 ret = bcf_read(fp.get(), r.header->hdr, r.line.get());
1769 bcf_unpack(r.line.get(), BCF_UN_ALL);
1782 std::shared_ptr<htsFile> fp;
1785 bool isHeaderWritten =
false;
1800 BcfWriter(
const std::string & fname, std::string version =
"VCF4.1")
1828 BcfWriter(
const std::string & fname,
const std::string & version,
const std::string & mode)
1857 void open(
const std::string & fname)
1859 auto mode = details::getMode(fname,
"w");
1861 if(!fp)
throw std::invalid_argument(
"I/O error: input file is invalid");
1873 void open(
const std::string & fname,
const std::string & mode)
1876 if(!fp)
throw std::invalid_argument(
"I/O error: input file is invalid");
1890 header.hdr = bcf_hdr_init(
"w");
1901 void copyHeader(
const std::string & vcffile, std::string samples =
"-")
1903 htsFile * fp2 = hts_open(vcffile.c_str(),
"r");
1904 if(!fp2)
throw std::invalid_argument(
"I/O error: input file is invalid");
1907 bcf_hdr_t * hfull = bcf_hdr_read(fp2);
1908 header.hdr = bcf_hdr_subset(hfull, 0, 0, 0);
1909 bcf_hdr_remove(
header.hdr, BCF_HL_FMT, NULL);
1910 bcf_hdr_destroy(hfull);
1914 header.hdr = bcf_hdr_read(fp2);
1924 if(!isHeaderWritten && !
writeHeader())
throw std::runtime_error(
"could not write header\n");
1925 kstring_t s = {0, 0, NULL};
1926 kputsn(vcfline.c_str(), vcfline.length(), &s);
1927 ret = vcf_parse1(&s, hp->hdr, b.get());
1929 if(ret > 0)
throw std::runtime_error(
"error parsing: " + vcfline +
"\n");
1930 if(b->errcode == BCF_ERR_CTG_UNDEF)
1932 throw std::runtime_error(
"contig id " + (std::string)bcf_hdr_id2name(hp->hdr, b->rid)
1933 +
" not found in the header. please run header->AddContig() first.\n");
1935 ret = bcf_write(fp.get(), hp->hdr, b.get());
1936 if(ret != 0)
throw std::runtime_error(
"error writing: " + vcfline +
"\n");
1942 ret = bcf_hdr_write(fp.get(), hp->hdr);
1943 if(ret == 0)
return isHeaderWritten =
true;
1951 if(bcf_write(fp.get(), v.header->hdr, v.line.get()) < 0)
return false;
Stream in variants from compressed/uncompressed VCF/BCF file or stdin.
Definition: vcfpp.h:1550
void setSamples(const std::string &samples)
explicitly stream to specific samples
Definition: vcfpp.h:1694
void open(const std::string &file)
Open a VCF/BCF/STDIN file for streaming in.
Definition: vcfpp.h:1624
BcfReader(const std::string &file, const std::string ®ion, const std::string &samples)
construct a vcf/bcf reader with subset samples in target region
Definition: vcfpp.h:1602
BcfHeader header
a BcfHeader object
Definition: vcfpp.h:1562
int setThreads(int n)
set the number of threads to use
Definition: vcfpp.h:1637
BcfReader(const std::string &file, const std::string ®ion)
construct a vcf/bcf reader with subset samples
Definition: vcfpp.h:1585
bool getNextVariant(BcfRecord &r)
read in the next variant
Definition: vcfpp.h:1743
BcfReader()
Construct an empty BcfReader.
Definition: vcfpp.h:1569
int nsamples
number of samples in the VCF
Definition: vcfpp.h:1564
const BcfHeader & getHeader() const
return a BcfHeader object
Definition: vcfpp.h:1643
BcfReader(const std::string &file)
construct a vcf/bcf reader from file.
Definition: vcfpp.h:1575
void close()
Close the VCF file and its associated files.
Definition: vcfpp.h:1615
void setRegion(const std::string ®ion)
explicitly stream to specific region. throw invalid_argument error if index file not found....
Definition: vcfpp.h:1708
int getVariantsCount(const std::string ®ion)
count the number of variants by iterating through a given region.
Definition: vcfpp.h:1680
int getStatus(const std::string ®ion)
query the status of a given region in the VCF
Definition: vcfpp.h:1656
std::vector< std::string > SamplesName
a vector of samples name in the VCF
Definition: vcfpp.h:1566
Object represents a variant record in the VCF, offering methods to access and modify fields.
Definition: vcfpp.h:496
float QUAL()
return QUAL value
Definition: vcfpp.h:1401
bool isIndel() const
return boolean value indicates if current variant is exclusively INDEL
Definition: vcfpp.h:1180
bool hasINDEL() const
return boolean value indicates if current variant has INDEL type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1241
void setRefAlt(const std::string &s)
set REF and ALT alleles given a string seperated by comma
Definition: vcfpp.h:1340
isValidGT< T > getGenotypes(T &v)
fill in the input vector with genotypes of 0 and 1. only works for ploidy<=2. Genotypes with missing ...
Definition: vcfpp.h:581
void setPOS(int64_t p)
modify position given 1-based value
Definition: vcfpp.h:1328
bool hasINS() const
return boolean value indicates if current variant has INS type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1250
BcfRecord()
empty constructor. call init() afterwards
Definition: vcfpp.h:519
std::string allINFO()
return raw INFO column as string. recommend to use getINFO for specific tag.
Definition: vcfpp.h:1438
int ploidy() const
return the number of ploidy of current variant
Definition: vcfpp.h:1517
void removeINFO(std::string tag)
remove the given tag from INFO of the variant
Definition: vcfpp.h:968
BcfRecord(BcfHeader &h)
constructor with a given BcfHeader object
Definition: vcfpp.h:522
std::string CHROM() const
return CHROM name
Definition: vcfpp.h:1304
void setID(const std::string &s)
update ID
Definition: vcfpp.h:1334
isValidFMT< T > setFORMAT(std::string tag, const T &v)
set tag values for all samples in FORMAT using given vector
Definition: vcfpp.h:1075
int64_t POS() const
return 1-base position
Definition: vcfpp.h:1316
std::string ID() const
return ID field
Definition: vcfpp.h:1310
isScalar< T > setFORMAT(std::string tag, const T &v)
set tag for a single sample in FORMAT using given singular value. this works only when there is one s...
Definition: vcfpp.h:1113
bool isSV() const
return boolean value indicates if current variant is Structual Variant or not
Definition: vcfpp.h:1167
void setQUAL(float q)
modify the QUAL value
Definition: vcfpp.h:1346
bool hasSNP() const
return boolean value indicates if current variant has SNP type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1232
std::vector< char > isGenoMissing
if there is "." in GT for the sample, then it's coded as missing (TRUE)
Definition: vcfpp.h:515
std::string FILTER()
return raw FILTER column as string
Definition: vcfpp.h:1415
void swap_REF_ALT()
swap REF and ALT for biallelic SNP
Definition: vcfpp.h:1383
std::string REF() const
return raw REF alleles as string
Definition: vcfpp.h:1377
std::string asString() const
return current variant as raw string
Definition: vcfpp.h:557
int64_t Start() const
return 0-base start of the variant (can be any type)
Definition: vcfpp.h:1365
bool hasBND() const
return boolean value indicates if current variant has BND type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1277
friend std::ostream & operator<<(std::ostream &out, const BcfRecord &v)
stream out the variant
Definition: vcfpp.h:550
isScalar< T > setINFO(std::string tag, const T &v)
set tag value for INFO
Definition: vcfpp.h:901
void resetHeader(BcfHeader &h)
reset the header associated with BcfRecord object by pointing to another BcfHeader object
Definition: vcfpp.h:544
bool hasMNP() const
return boolean value indicates if current variant has MNP type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1268
bool allPhased() const
return boolean value indicates if genotypes of all samples are phased
Definition: vcfpp.h:1511
bool hasDEL() const
return boolean value indicates if current variant has DEL type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1259
bool isMultiAllelicSNP() const
return boolean value indicates if current variant is exclusively multiallelic SNP sites
Definition: vcfpp.h:1201
void setPhasing(const std::vector< char > &v)
set phasing status for all diploid samples using given vector
Definition: vcfpp.h:1040
bool setGenotypes(const std::vector< int > &v)
set genotypes from scratch even if genotypes not present
Definition: vcfpp.h:999
std::string ALT() const
return raw ALT alleles as string
Definition: vcfpp.h:1389
bool hasOTHER() const
return boolean value indicates if current variant has OTHER type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1286
isString< T > getINFO(std::string tag, T &v)
get tag value in INFO
Definition: vcfpp.h:876
bool isSNP() const
return boolean value indicates if current variant is exclusively biallelic SNP. Note ALT=* are skippe...
Definition: vcfpp.h:1218
bool isMultiAllelics() const
return boolean value indicates if current variant is multiallelic sites
Definition: vcfpp.h:1194
void setCHR(const std::string &s)
modify CHROM value
Definition: vcfpp.h:1322
int64_t End() const
return 0-base end of the variant (can be any type)
Definition: vcfpp.h:1371
void setPloidy(int v)
in a rare case, one may want to set the number of ploidy manually
Definition: vcfpp.h:1523
isFormatVector< T > getFORMAT(std::string tag, T &v)
get tag value in FORMAT
Definition: vcfpp.h:713
void initHeader(BcfHeader &h)
initilize the header associated with BcfRecord object by pointing to another BcfHeader object
Definition: vcfpp.h:534
void addLineFromString(const std::string &vcfline)
add one variant record from given string
Definition: vcfpp.h:1140
bool getFORMAT(std::string tag, std::vector< std::string > &v)
get tag value in FORMAT
Definition: vcfpp.h:761
bool isNoneMissing() const
if all samples have non missing values for the tag in FORMAT
Definition: vcfpp.h:1161
void setQUAL(char q)
modify the QUAL value
Definition: vcfpp.h:1352
void setFILTER(const std::string &s)
modify the FILTER value
Definition: vcfpp.h:1358
isValidInfo< T > setINFO(std::string tag, const T &v)
set tag value for INFO
Definition: vcfpp.h:936
std::vector< char > gtPhase
vector of nsamples length. keep track of the phasing status of each sample
Definition: vcfpp.h:1542
void removeFORMAT(std::string tag)
remove the given tag from FORMAT of the variant
Definition: vcfpp.h:1048
std::vector< char > typeOfGT
vector of nsamples length. keep track of the type of genotype (one of GT_HOM_RR, GT_HET_RA,...
Definition: vcfpp.h:1539
isInfoVector< T > getINFO(std::string tag, T &v)
get tag value in INFO
Definition: vcfpp.h:801
bool getGenotypes(std::vector< int > &v)
fill in the input vector with genotype values, 0, 1 or -9 (missing).
Definition: vcfpp.h:654
bool hasOVERLAP() const
return boolean value indicates if current variant has OVERLAP type defined in vcf....
Definition: vcfpp.h:1295
isScalar< T > getINFO(std::string tag, T &v)
get tag value in INFO
Definition: vcfpp.h:839
Stream out variants to compressed/uncompressed VCF/BCF file or stdout.
Definition: vcfpp.h:1780
void close()
close the BcfWriter object.
Definition: vcfpp.h:1880
void writeLine(const std::string &vcfline)
copy a string to a vcf line
Definition: vcfpp.h:1922
BcfWriter(const std::string &fname, const std::string &version, const std::string &mode)
Open VCF/BCF file for writing using given mode.
Definition: vcfpp.h:1828
void copyHeader(const std::string &vcffile, std::string samples="-")
copy header of given VCF and restrict on samples. if samples=="", FORMAT removed and only sites left
Definition: vcfpp.h:1901
BcfWriter(const std::string &fname, std::string version="VCF4.1")
Open VCF/BCF file for writing. The format is infered from file's suffix.
Definition: vcfpp.h:1800
void open(const std::string &fname)
Open VCF/BCF file for writing. The format is infered from file's suffix.
Definition: vcfpp.h:1857
BcfWriter(const std::string &fname, const BcfHeader &h)
Open VCF/BCF file for writing. The format is infered from file's suffix.
Definition: vcfpp.h:1812
void initalHeader(const BcfHeader &h)
initial a VCF header by pointing to header of another VCF
Definition: vcfpp.h:1895
BcfWriter(const std::string &fname, const BcfHeader &h, const std::string &mode)
Open VCF/BCF file for writing using given mode.
Definition: vcfpp.h:1845
void initalHeader(std::string version="VCF4.1")
initial a VCF header using the internal template given a specific version. VCF4.1 is the default
Definition: vcfpp.h:1888
BcfHeader header
header object initialized by initalHeader
Definition: vcfpp.h:1790
BcfWriter()
Construct an empty BcfWriter.
Definition: vcfpp.h:1793
void open(const std::string &fname, const std::string &mode)
Open VCF/BCF file for writing using given mode.
Definition: vcfpp.h:1873
bool writeRecord(BcfRecord &v)
streams out the given variant of BcfRecord type
Definition: vcfpp.h:1948
bool writeHeader()
streams out the header
Definition: vcfpp.h:1940