vcfpp
vcfpp.h
1 /*******************************************************************************
2  * @file https://github.com/Zilong-Li/vcfpp/vcfpp.h
3  * @author Zilong Li
4  * @email zilong.dk@gmail.com
5  * @version v0.5.0
6  * @breif a single C++ file for manipulating VCF
7  * Copyright (C) 2022-2023.The use of this code is governed by the LICENSE file.
8  ******************************************************************************/
9 
36 #pragma once
37 
38 #ifndef VCFPP_H_
39 # define VCFPP_H_
40 
41 # include <iostream>
42 # include <memory>
43 # include <string>
44 # include <type_traits>
45 # include <vector>
46 
47 // make sure you have htslib installed
48 extern "C"
49 {
50 # include <htslib/kstring.h>
51 # include <htslib/tbx.h>
52 # include <htslib/vcf.h>
53 # include <htslib/vcfutils.h>
54 }
55 
56 namespace vcfpp
57 {
58 template<typename T>
59 using isValidFMT =
60  typename std::enable_if<std::is_same<T, std::string>::value || std::is_same<T, std::vector<char>>::value
61  || std::is_same<T, std::vector<int>>::value
62  || std::is_same<T, std::vector<float>>::value,
63  bool>::type;
64 
65 template<typename T>
66 using isValidInfo =
67  typename std::enable_if<std::is_same<T, std::string>::value || std::is_same<T, std::vector<int>>::value
68  || std::is_same<T, std::vector<float>>::value,
69  bool>::type;
70 
71 template<typename T>
72 using isInfoVector = typename std::enable_if<std::is_same<T, std::vector<int>>::value
73  || std::is_same<T, std::vector<float>>::value,
74  bool>::type;
75 
76 template<typename T>
77 using isScalar = typename std::enable_if<std::is_same<T, int>::value || std::is_same<T, float>::value
78  || std::is_same<T, double>::value,
79  bool>::type;
80 
81 template<typename T>
82 using isString = typename std::enable_if<std::is_same<T, std::string>::value, bool>::type;
83 
84 template<typename T>
85 using isValidGT = typename std::enable_if<std::is_same<T, std::vector<bool>>::value
86  || std::is_same<T, std::vector<char>>::value,
87  bool>::type;
88 
89 template<typename T>
90 using isFormatVector = typename std::enable_if<std::is_same<T, std::vector<float>>::value
91  || std::is_same<T, std::vector<char>>::value
92  || std::is_same<T, std::vector<int>>::value,
93  bool>::type;
94 
95 template<typename T>
96 isScalar<T> isMissing(T const & v)
97 {
98  if(std::is_same<T, float>::value)
99  {
100  return bcf_float_is_missing(v);
101  }
102  else if(std::is_same<T, int>::value)
103  {
104  return bcf_int32_missing(v);
105  }
106 }
107 
108 // test if a string is end with another string
109 inline bool isEndWith(std::string const & s, std::string const & e)
110 {
111  if(s.length() >= e.length())
112  {
113  return (0 == s.compare(s.length() - e.length(), e.length(), e));
114  }
115  else
116  {
117  return false;
118  }
119 }
120 
121 // determinate the mode for read/write the compressed/uncompressed VCF/BCF
122 inline std::string getMode(std::string const & fname, std::string mode = "r")
123 {
124  if(isEndWith(fname, "bcf.gz")) mode += "b";
125  if(isEndWith(fname, "bcf")) mode += "bu";
126  if(isEndWith(fname, "vcf.gz")) mode += "z";
127  return mode;
128 }
129 
130 // string split by separator
131 inline std::vector<std::string> split_string(const std::string & s, const std::string & separators)
132 {
133  std::vector<std::string> ret;
134  bool is_seperator[256] = {false};
135  for(auto & ch : separators)
136  {
137  is_seperator[(unsigned int)ch] = true;
138  }
139  int begin = 0;
140  for(int i = 0; i <= (int)s.size(); i++)
141  {
142  if(is_seperator[(uint8_t)s[i]] || i == (int)s.size())
143  {
144  ret.push_back(std::string(s.begin() + begin, s.begin() + i));
145  begin = i + 1;
146  }
147  }
148  return ret;
149 }
150 
151 // deleter for htsFile
153 {
154  void operator()(htsFile * x)
155  {
156  if(x) hts_close(x);
157  }
158 };
159 
160 // deleter for variant
162 {
163  void operator()(bcf1_t * x)
164  {
165  if(x) bcf_destroy(x);
166  }
167 };
168 
169 // deleter for bcf header
171 {
172  void operator()(bcf_hdr_t * x)
173  {
174  if(x) bcf_hdr_destroy(x);
175  }
176 };
177 
184 {
185  friend class BcfRecord;
186  friend class BcfReader;
187  friend class BcfWriter;
188 
189  private:
190  bcf_hdr_t * hdr = NULL; // bcf header
191  bcf_hrec_t * hrec = NULL; // populate header
192 
193  public:
194  BcfHeader() {}
195 
196  ~BcfHeader()
197  {
198  if(hrec) bcf_hrec_destroy(hrec);
199  if(hdr) bcf_hdr_destroy(hdr);
200  }
201 
203  friend std::ostream & operator<<(std::ostream & out, const BcfHeader & h)
204  {
205  out << h.asString();
206  return out;
207  }
208 
209  // TODO: check if the value is valid for vcf specification
210 
217  inline void addINFO(const std::string & id,
218  const std::string & number,
219  const std::string & type,
220  const std::string & description)
221  {
222  addLine("##INFO=<ID=" + id + ",Number=" + number + ",Type=" + type + ",Description=\"" + description
223  + "\">");
224  }
225 
232  inline void addFORMAT(const std::string & id,
233  const std::string & number,
234  const std::string & type,
235  const std::string & description)
236  {
237  addLine("##FORMAT=<ID=" + id + ",Number=" + number + ",Type=" + type + ",Description=\"" + description
238  + "\">");
239  }
240 
246  inline void addFILTER(const std::string & id, const std::string & description)
247  {
248  addLine("##FILTER=<ID=" + id + ",Description=\"" + description + "\">");
249  }
250 
254  inline void addContig(const std::string & id)
255  {
256  addLine("##contig=<ID=" + id + ">");
257  }
258 
262  inline void addLine(const std::string & str)
263  {
264  int ret = 0;
265  ret = bcf_hdr_append(hdr, str.c_str());
266  if(ret != 0) throw std::runtime_error("could not add " + str + " to header\n");
267  ret = bcf_hdr_sync(hdr);
268  if(ret != 0) throw std::runtime_error("could not add " + str + " to header\n");
269  }
270 
272  inline void addSample(const std::string & sample) const
273  {
274  bcf_hdr_add_sample(hdr, sample.c_str());
275  if(bcf_hdr_sync(hdr) != 0)
276  {
277  throw std::runtime_error("couldn't add the sample.\n");
278  }
279  }
280 
282  inline std::string asString() const
283  {
284  kstring_t s = {0, 0, NULL}; // kstring
285  if(bcf_hdr_format(hdr, 0, &s) == 0) // append header string to s.s! append!
286  {
287  std::string out(s.s, s.l);
288  free(s.s);
289  return out;
290  }
291  else
292  {
293  throw std::runtime_error("failed to convert formatted header to string");
294  }
295  }
296 
298  std::vector<std::string> getSamples() const
299  {
300  std::vector<std::string> vec;
301  for(int i = 0; i < bcf_hdr_nsamples(hdr); i++)
302  {
303  vec.push_back(std::string(hdr->samples[i]));
304  }
305  return vec;
306  }
307 
309  std::vector<std::string> getSeqnames() const
310  {
311  int ret = 0;
312  const char ** seqs = bcf_hdr_seqnames(hdr, &ret);
313  if(ret == 0) printf("there is no contig id in the header!\n");
314  std::vector<std::string> vec;
315  for(int i = 0; i < ret; i++)
316  {
317  vec.push_back(std::string(seqs[i]));
318  }
319  free(seqs);
320  return vec;
321  }
322 
328  inline int getFormatType(std::string tag) const
329  {
330  int tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag.c_str());
331  if(tag_id < 0) return 0;
332  if(bcf_hdr_id2type(hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
333  {
334  return 1;
335  }
336  else if(bcf_hdr_id2type(hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
337  {
338  return 2;
339  }
340  else if(bcf_hdr_id2type(hdr, BCF_HL_FMT, tag_id) == (BCF_HT_STR & 0xff))
341  {
342  return 3;
343  }
344  else
345  {
346  return 0;
347  }
348  }
349 
351  inline void removeContig(std::string tag) const
352  {
353  bcf_hdr_remove(hdr, BCF_HL_CTG, tag.c_str());
354  }
355 
357  inline void removeInfo(std::string tag) const
358  {
359  bcf_hdr_remove(hdr, BCF_HL_INFO, tag.c_str());
360  }
361 
363  inline void removeFormat(std::string tag) const
364  {
365  bcf_hdr_remove(hdr, BCF_HL_FMT, tag.c_str());
366  }
367 
369  inline void removeFilter(std::string tag) const
370  {
371  bcf_hdr_remove(hdr, BCF_HL_FLT, tag.c_str());
372  }
373 
378  inline void setSamples(const std::string & samples) const
379  {
380  int ret = 0;
381  ret = bcf_hdr_set_samples(hdr, samples.c_str(), 0);
382  if(ret != 0)
383  {
384  throw std::runtime_error("the " + std::to_string(ret)
385  + "-th sample are not in the VCF.\nparameter samples:" + samples);
386  }
387  }
388 
390  inline void setVersion(const std::string & version) const
391  {
392  bcf_hdr_set_version(hdr, version.c_str());
393  }
394 
396  inline int nSamples() const
397  {
398  return bcf_hdr_nsamples(hdr);
399  }
400 };
401 
409 {
410  friend class BcfReader;
411  friend class BcfWriter;
412 
413  private:
414  BcfHeader * header;
415  std::shared_ptr<bcf1_t> line = std::shared_ptr<bcf1_t>(bcf_init(), variant_close()); // variant
416  bcf_hdr_t * hdr_d = NULL; // a dup header by bcf_hdr_dup(header->hdr)
417  bcf_fmt_t * fmt = NULL;
418  bcf_info_t * info = NULL;
419  int32_t * gts = NULL;
420  int ndst, ret, nsamples;
421  bool noneMissing = true; // whenever parsing a tag have to reset this variable
422  bool isAllPhased = false;
423  int nploidy = 0; // the number of ploidy
424  int nvalues = 0; // the number of values for a tag in FORMAT
425 
426  public:
428  std::vector<char> isGenoMissing;
429 
430  public:
433 
436  {
437  initHeader(h);
438  }
439 
440  ~BcfRecord()
441  {
442  if(gts) free(gts);
443  if(hdr_d) bcf_hdr_destroy(hdr_d);
444  }
445 
448  {
449  header = &h;
450  if(!header->hdr) throw std::runtime_error("please initial header first\n");
451  nsamples = header->nSamples();
452  typeOfGT.resize(nsamples);
453  gtPhase.resize(nsamples, 0);
454  }
455 
458  {
459  header = &h;
460  }
461 
463  friend std::ostream & operator<<(std::ostream & out, const BcfRecord & v)
464  {
465  out << v.asString();
466  return out;
467  }
468 
470  inline std::string asString() const
471  {
472  kstring_t s = {0, 0, NULL}; // kstring
473  if(vcf_format(header->hdr, line.get(), &s) == 0)
474  {
475  std::string out(s.s, s.l);
476  free(s.s);
477  return out;
478  }
479  else
480  {
481  throw std::runtime_error("couldn't format current record into a string.\n");
482  }
483  }
484 
493  template<typename T>
494  isValidGT<T> getGenotypes(T & v)
495  {
496  ndst = 0;
497  ret = bcf_get_genotypes(header->hdr, line.get(), &gts, &ndst);
498  if(ret <= 0)
499  {
500 # if defined(VERBOSE)
501  std::cerr << "GT not present for current site. did you initilize the variant object?\n";
502 # endif
503  return false;
504  }
505  // if nploidy is not set manually. find the max nploidy using the first variant (eg. 2) resize v as
506  // max(nploidy)
507  // * nsamples (ret) NOTE: if ret == nsamples and only one sample then haploid
508  if(ret != nploidy * nsamples && nploidy > 0)
509  {
510  // rare case if nploidy is set manually. eg. only one sample. the first variant is '1' but the
511  // second is '1/0'. ret = 1 but nploidy should be 2
512  v.resize(nploidy * nsamples);
513  }
514  else
515  {
516  v.resize(ret);
517  nploidy = ret / nsamples;
518  }
519  // work with nploidy == 1, haploid?
520  isGenoMissing.assign(nsamples, 0);
521  int i, j, nphased = 0;
522  noneMissing = true;
523  fmt = bcf_get_fmt(header->hdr, line.get(), "GT");
524  int nploidy_cur = ret / nsamples; // requires nploidy_cur <= nploidy
525  for(i = 0; i < nsamples; i++)
526  {
527  // check and fill in typeOfGT; only supports SNPs now. check vcfstats.c for inspiration
528  typeOfGT[i] = bcf_gt_type(fmt, i, NULL, NULL);
529  if(typeOfGT[i] == GT_UNKN)
530  {
531  noneMissing = false; // set missing as het, user should check if isNoneMissing();
532  isGenoMissing[i] = 1;
533  v[i * nploidy + 0] = 1;
534  for(j = 1; j < nploidy_cur; j++) v[i * nploidy + j] = 0;
535  continue;
536  }
537 
538  for(j = 0; j < nploidy_cur; j++)
539  {
540  // TODO: right now only parse 0 and 1, ie max(nploidy)=2; other values 2,3... will be set to 1
541  v[i * nploidy + j] = bcf_gt_allele(gts[j + i * nploidy_cur]) != 0;
542  }
543  if(nploidy == 2)
544  {
545  gtPhase[i] = (gts[1 + i * nploidy_cur] & 1) == 1;
546  nphased += gtPhase[i];
547  }
548  }
549  if(nphased == nsamples)
550  {
551  isAllPhased = true;
552  }
553  else
554  {
555  isAllPhased = false;
556  }
557  return true;
558  }
559 
567  bool getGenotypes(std::vector<int> & v)
568  {
569  ndst = 0;
570  ret = bcf_get_genotypes(header->hdr, line.get(), &gts, &ndst);
571  if(ret <= 0)
572  {
573 # if defined(VERBOSE)
574  std::cerr << "GT not present for current site. did you initilize the variant object?\n";
575 # endif
576  return false;
577  }
578  v.resize(ret);
579  isGenoMissing.assign(nsamples, 0);
580  nploidy = ret / nsamples;
581  int i, j, nphased = 0;
582  noneMissing = true;
583  for(i = 0; i < nsamples; i++)
584  {
585  int32_t * ptr = gts + i * nploidy;
586  int is_phased = 0;
587  for(j = 0; j < nploidy; j++)
588  {
589  // if true, the sample has smaller ploidy
590  if(ptr[j] == bcf_int32_vector_end) break;
591  // missing allele
592  if(bcf_gt_is_missing(ptr[j]))
593  {
594  noneMissing = false;
595  isGenoMissing[i] = 1;
596  v[i * nploidy + j] = -9;
597  continue;
598  }
599  v[i * nploidy + j] = bcf_gt_allele(ptr[j]);
600  is_phased += bcf_gt_is_phased(ptr[j]);
601  }
602  if(nploidy == is_phased)
603  {
604  gtPhase[i] = true;
605  nphased += gtPhase[i];
606  }
607  }
608  if(nphased == nsamples)
609  {
610  isAllPhased = true;
611  }
612  else
613  {
614  isAllPhased = false;
615  }
616  return true;
617  }
618 
625  template<typename T, typename S = typename T::value_type>
626  isFormatVector<T> getFORMAT(std::string tag, T & v)
627  {
628  fmt = bcf_get_fmt(header->hdr, line.get(), tag.c_str());
629  if(!fmt)
630  {
631  throw std::invalid_argument("no FORMAT=" + tag + " in the VCF header.\n");
632  }
633  nvalues = fmt->n;
634  ndst = 0;
635  S * dst = NULL;
636  int tagid = header->getFormatType(tag);
637  if(tagid == 1)
638  {
639  ret = bcf_get_format_int32(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
640  }
641  else if(tagid == 2)
642  {
643  ret = bcf_get_format_float(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
644  }
645  else if(tagid == 3)
646  {
647  ret = bcf_get_format_char(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
648  }
649  else
650  {
651  ret = -1;
652  }
653 
654  if(ret >= 0)
655  {
656  // user have to check if there is missing in the return v;
657  v = std::vector<S>(dst, dst + ret);
658  free(dst);
659  return true;
660  }
661  else
662  {
663  free(dst);
664  return false;
665  }
666  }
667 
674  bool getFORMAT(std::string tag, std::vector<std::string> & v)
675  {
676  fmt = bcf_get_fmt(header->hdr, line.get(), tag.c_str());
677  if(!fmt)
678  {
679  throw std::invalid_argument("no FORMAT=" + tag + " in the VCF header.\n");
680  }
681  nvalues = fmt->n;
682  // if ndst < (fmt->n+1)*nsmpl; then realloc is involved
683  ret = -1, ndst = 0;
684  char ** dst = NULL;
685  if(header->getFormatType(tag) == 3)
686  ret = bcf_get_format_string(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
687  if(ret > 0)
688  {
689  v.clear();
690  for(int i = 0; i < nsamples; i++)
691  {
692  // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT.
693  v.emplace_back(dst[i]);
694  };
695  free(dst[0]);
696  free(dst);
697  return true;
698  }
699  else
700  {
701  free(dst[0]);
702  free(dst);
703  return false;
704  }
705  }
706 
713  template<typename T, typename S = typename T::value_type>
714  isInfoVector<T> getINFO(std::string tag, T & v)
715  {
716  info = bcf_get_info(header->hdr, line.get(), tag.c_str());
717  if(!info)
718  {
719  throw std::invalid_argument("no INFO=" + tag + " in the VCF header.\n");
720  }
721  S * dst = NULL;
722  ndst = 0;
723  ret = -1;
724  if(info->type == BCF_BT_INT8 || info->type == BCF_BT_INT16 || info->type == BCF_BT_INT32)
725  {
726  ret = bcf_get_info_int32(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
727  }
728  else if(info->type == BCF_BT_FLOAT)
729  {
730  ret = bcf_get_info_float(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
731  }
732  if(ret >= 0)
733  {
734  v = std::vector<S>(dst, dst + ret); // user have to check if there is missing in the return v;
735  free(dst);
736  return true;
737  }
738  else
739  {
740  free(dst);
741  return false;
742  }
743  }
744 
751  template<typename T>
752  isScalar<T> getINFO(std::string tag, T & v)
753  {
754  info = bcf_get_info(header->hdr, line.get(), tag.c_str());
755  if(!info)
756  {
757  throw std::invalid_argument("no INFO=" + tag + " in the VCF header.\n");
758  }
759  // scalar value
760  if(info->len == 1)
761  {
762  if(info->type == BCF_BT_INT8 || info->type == BCF_BT_INT16 || info->type == BCF_BT_INT32)
763  {
764  v = info->v1.i;
765  }
766  else if(info->type == BCF_BT_FLOAT)
767  {
768  v = info->v1.f;
769  }
770  return true;
771  }
772  else
773  {
774 # if defined(VERBOSE)
775  std::cerr << "there are multiple values for " + tag
776  + " in INFO for current site. please use vector instead\n";
777 # endif
778  return false;
779  }
780  }
781 
788  template<typename T>
789  isString<T> getINFO(std::string tag, T & v)
790  {
791  info = bcf_get_info(header->hdr, line.get(), tag.c_str());
792  if(!info)
793  {
794  throw std::invalid_argument("no INFO=" + tag + " in the VCF header.\n");
795  }
796  if(info->type == BCF_BT_CHAR)
797  {
798  v = std::string(reinterpret_cast<char *>(info->vptr), info->vptr_len);
799  return true;
800  }
801  else
802  {
803  return false;
804  }
805  }
806 
813  template<typename T>
814  isScalar<T> setINFO(std::string tag, const T & v)
815  {
816  // bcf_hrec_set_val
817  // bcf_update_info_flag(header.hdr, line, tag.c_str(), NULL, 1);
818  int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
819  if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff))
820  {
821  ret = bcf_update_info_int32(header->hdr, line.get(), tag.c_str(), &v, 1);
822  }
823  else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff))
824  {
825  float v2 = static_cast<float>(v);
826  ret = bcf_update_info_float(header->hdr, line.get(), tag.c_str(), &v2, 1);
827  }
828  else
829  {
830  ret = -1;
831  }
832  if(ret < 0)
833  {
834 # if defined(VERBOSE)
835  std::cerr << "couldn't set " + tag + " for this variant.\nplease add the tag in headerfirst.\n";
836 # endif
837  return false;
838  }
839  return true;
840  }
841 
848  template<typename T>
849  isValidInfo<T> setINFO(std::string tag, const T & v)
850  {
851  // bcf_update_info_flag(header.hdr, line, tag.c_str(), NULL, 1);
852  int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
853  if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff))
854  {
855  ret = bcf_update_info_int32(header->hdr, line.get(), tag.c_str(), v.data(), v.size());
856  }
857  else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff))
858  {
859  ret = bcf_update_info_float(header->hdr, line.get(), tag.c_str(), v.data(), v.size());
860  }
861  else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_STR & 0xff))
862  {
863  ret = bcf_update_info_string(header->hdr, line.get(), tag.c_str(), v.data());
864  }
865  else
866  {
867  ret = -1;
868  }
869 
870  if(ret < 0)
871  {
872 # if defined(VERBOSE)
873  std::cerr << "couldn't set " + tag + " for this variant.\nplease add the tag in headerfirst.\n";
874 # endif
875  return false;
876  }
877  return true;
878  }
879 
881  void removeINFO(std::string tag)
882  {
883  int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
884  if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff))
885  {
886  ret = bcf_update_info_int32(header->hdr, line.get(), tag.c_str(), NULL, 0);
887  }
888  else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff))
889  {
890  ret = bcf_update_info_float(header->hdr, line.get(), tag.c_str(), NULL, 0);
891  }
892  else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_STR & 0xff))
893  {
894  ret = bcf_update_info_string(header->hdr, line.get(), tag.c_str(), NULL);
895  }
896  else
897  {
898  ret = -1;
899  }
900 
901  if(ret < 0)
902  {
903  throw std::runtime_error("couldn't remove " + tag + " for this variant.\n");
904  }
905  }
906 
912  bool setGenotypes(const std::vector<int> & v)
913  {
914  // bcf_gt_type
915  int i, j, k;
916  nploidy = v.size() / nsamples;
917  int32_t * gt = (int32_t *)malloc(v.size() * sizeof(int32_t));
918  for(i = 0; i < nsamples; i++)
919  {
920  for(j = 0; j < nploidy; j++)
921  {
922  k = i * nploidy + j;
923  if(v[k] == -9 || v[k] == bcf_int32_missing)
924  {
925  gt[k] = bcf_gt_missing;
926  }
927  else if(gtPhase[i])
928  {
929  gt[k] = bcf_gt_phased(v[k]);
930  }
931  else
932  {
933  gt[k] = bcf_gt_unphased(v[k]);
934  }
935  }
936  }
937  if(bcf_update_genotypes(header->hdr, line.get(), gt, v.size()) < 0)
938  {
939  free(gt);
940 # if defined(VERBOSE)
941  std::cerr << "couldn't set genotypes correctly.\n";
942 # endif
943  return false;
944  }
945  free(gt);
946  return true;
947  }
948 
953  void setPhasing(const std::vector<char> & v)
954  {
955  assert((int)v.size() == nsamples);
956  gtPhase = v;
957  }
958 
960  void removeFORMAT(std::string tag)
961  {
962  ret = -1;
963  int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
964  if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
965  {
966  ret = bcf_update_format_int32(header->hdr, line.get(), tag.c_str(), NULL, 0);
967  }
968  else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_STR & 0xff))
969  {
970  ret = bcf_update_format_char(header->hdr, line.get(), tag.c_str(), NULL, 0);
971  }
972  else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
973  {
974  ret = bcf_update_format_float(header->hdr, line.get(), tag.c_str(), NULL, 0);
975  }
976 
977  if(ret < 0) throw std::runtime_error("couldn't remove " + tag + " correctly.\n");
978  }
979 
986  template<typename T>
987  isValidFMT<T> setFORMAT(std::string tag, const T & v)
988  {
989  int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
990  if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
991  {
992  ret = bcf_update_format_int32(header->hdr, line.get(), tag.c_str(), v.data(), v.size());
993  }
994  else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_STR & 0xff))
995  {
996  ret = bcf_update_format_char(header->hdr, line.get(), tag.c_str(), v.data(), v.size());
997  }
998  else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
999  {
1000  ret = bcf_update_format_float(header->hdr, line.get(), tag.c_str(), v.data(), v.size());
1001  }
1002  else
1003  {
1004  ret = -1;
1005  }
1006 
1007  if(ret < 0)
1008  {
1009 # if defined(VERBOSE)
1010  std::cerr << "couldn't set format " + tag + " corectly.\n";
1011 # endif
1012  return false;
1013  }
1014  return true;
1015  }
1016 
1024  template<typename T>
1025  isScalar<T> setFORMAT(std::string tag, const T & v)
1026  {
1027  float v2 = v;
1028  int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
1029  if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
1030  {
1031  ret = bcf_update_format_int32(header->hdr, line.get(), tag.c_str(), &v, 1);
1032  }
1033  else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
1034  {
1035  ret = bcf_update_format_float(header->hdr, line.get(), tag.c_str(), &v2, 1);
1036  }
1037  else
1038  {
1039  ret = -1;
1040  }
1041  if(ret < 0)
1042  {
1043 # if defined(VERBOSE)
1044  std::cerr << "couldn't set format " + tag + " corectly.\n";
1045 # endif
1046  return false;
1047  }
1048  return true;
1049  }
1050 
1052  void addLineFromString(const std::string & vcfline)
1053  {
1054  kstring_t s = {0, 0, NULL};
1055  kputsn(vcfline.c_str(), vcfline.length(), &s);
1056  ret = vcf_parse1(&s, header->hdr, line.get());
1057  free(s.s);
1058  if(ret > 0) throw std::runtime_error("error parsing: " + vcfline + "\n");
1059  if(line->errcode == BCF_ERR_CTG_UNDEF)
1060  {
1061  std::string contig(bcf_hdr_id2name(header->hdr, line->rid));
1062  hdr_d = bcf_hdr_dup(header->hdr);
1063  header->hrec = bcf_hdr_id2hrec(hdr_d, BCF_DT_CTG, 0, line->rid);
1064  if(header->hrec == NULL)
1065  throw std::runtime_error("contig" + contig + " unknow and not found in the header.\n");
1066  ret = bcf_hdr_add_hrec(header->hdr, header->hrec);
1067  if(ret < 0) throw std::runtime_error("error adding contig " + contig + " to header.\n");
1068  ret = bcf_hdr_sync(header->hdr);
1069  }
1070  }
1071 
1073  inline bool isNoneMissing() const
1074  {
1075  return noneMissing;
1076  }
1077 
1079  inline bool isSV() const
1080  {
1081  if(bcf_get_info(header->hdr, line.get(), "SVTYPE") == NULL)
1082  {
1083  return false;
1084  }
1085  else
1086  {
1087  return true;
1088  }
1089  }
1090 
1092  inline bool isIndel() const
1093  {
1094  // REF has multiple allels
1095  if(REF().length() > 1 && !isSV()) return true;
1096  for(int i = 1; i < line->n_allele; i++)
1097  {
1098  std::string alt(line->d.allele[i]);
1099  if(alt == ".") return true;
1100  if(alt.length() != REF().length() && !isSV()) return true;
1101  }
1102  return false;
1103  }
1104 
1106  inline bool isMultiAllelics() const
1107  {
1108  if(line->n_allele <= 2) return false;
1109  return true;
1110  }
1111 
1113  inline bool isMultiAllelicSNP() const
1114  {
1115  // skip REF with length > 1, i.e. INDEL
1116  if(REF().length() > 1 || line->n_allele <= 2) return false;
1117  for(int i = 1; i < line->n_allele; i++)
1118  {
1119  std::string snp(line->d.allele[i]);
1120  if(snp.length() != 1)
1121  {
1122  return false;
1123  }
1124  }
1125  return true;
1126  }
1127 
1130  inline bool isSNP() const
1131  {
1132  // REF and ALT have multiple allels
1133  if(REF().length() > 1 || line->n_allele > 2) return false;
1134  std::string snp(line->d.allele[1]);
1135  if(!(snp == "A" || snp == "C" || snp == "G" || snp == "T"))
1136  {
1137  return false;
1138  }
1139  return true;
1140  }
1141 
1144  inline bool hasSNP() const
1145  {
1146  int type = bcf_has_variant_types(line.get(), VCF_SNP, bcf_match_overlap);
1147  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1148  if(type == 0) return false;
1149  return true;
1150  }
1151 
1153  inline bool hasINDEL() const
1154  {
1155  int type = bcf_has_variant_types(line.get(), VCF_INDEL, bcf_match_overlap);
1156  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1157  if(type == 0) return false;
1158  return true;
1159  }
1160 
1162  inline bool hasINS() const
1163  {
1164  int type = bcf_has_variant_types(line.get(), VCF_INS, bcf_match_overlap);
1165  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1166  if(type == 0) return false;
1167  return true;
1168  }
1169 
1171  inline bool hasDEL() const
1172  {
1173  int type = bcf_has_variant_types(line.get(), VCF_DEL, bcf_match_overlap);
1174  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1175  if(type == 0) return false;
1176  return true;
1177  }
1178 
1180  inline bool hasMNP() const
1181  {
1182  int type = bcf_has_variant_types(line.get(), VCF_MNP, bcf_match_overlap);
1183  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1184  if(type == 0) return false;
1185  return true;
1186  }
1187 
1189  inline bool hasBND() const
1190  {
1191  int type = bcf_has_variant_types(line.get(), VCF_BND, bcf_match_overlap);
1192  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1193  if(type == 0) return false;
1194  return true;
1195  }
1196 
1198  inline bool hasOTHER() const
1199  {
1200  int type = bcf_has_variant_types(line.get(), VCF_OTHER, bcf_match_overlap);
1201  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1202  if(type == 0) return false;
1203  return true;
1204  }
1205 
1207  inline bool hasOVERLAP() const
1208  {
1209  int type = bcf_has_variant_types(line.get(), VCF_OVERLAP, bcf_match_overlap);
1210  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1211  if(type == 0) return false;
1212  return true;
1213  }
1214 
1216  inline std::string CHROM() const
1217  {
1218  return std::string(bcf_hdr_id2name(header->hdr, line->rid));
1219  }
1220 
1222  inline std::string ID() const
1223  {
1224  return std::string(line->d.id);
1225  }
1226 
1228  inline int64_t POS() const
1229  {
1230  return line->pos + 1;
1231  }
1232 
1234  inline void setCHR(const std::string & s)
1235  {
1236  line->rid = bcf_hdr_name2id(header->hdr, s.c_str());
1237  }
1238 
1240  inline void setPOS(int64_t p)
1241  {
1242  line->pos = p - 1;
1243  }
1244 
1246  inline void setID(const std::string & s)
1247  {
1248  bcf_update_id(header->hdr, line.get(), s.c_str());
1249  }
1250 
1252  inline void setRefAlt(const std::string & s)
1253  {
1254  bcf_update_alleles_str(header->hdr, line.get(), s.c_str());
1255  }
1256 
1258  inline void setQUAL(float q)
1259  {
1260  line->qual = q;
1261  }
1262 
1264  inline void setQUAL(char q)
1265  {
1266  bcf_float_set_missing(line->qual);
1267  }
1268 
1270  inline void setFILTER(const std::string & s)
1271  {
1272  int32_t tmpi = bcf_hdr_id2int(header->hdr, BCF_DT_ID, s.c_str());
1273  bcf_add_filter(header->hdr, line.get(), tmpi);
1274  }
1275 
1277  inline int64_t Start() const
1278  {
1279  return line->pos;
1280  }
1281 
1283  inline int64_t End() const
1284  {
1285  return line->pos + line->rlen;
1286  }
1287 
1289  inline std::string REF() const
1290  {
1291  return std::string(line->d.allele[0]);
1292  }
1293 
1295  inline void swap_REF_ALT()
1296  {
1297  if(!isMultiAllelicSNP()) std::swap(line->d.allele[0], line->d.allele[1]);
1298  }
1299 
1301  inline std::string ALT() const
1302  {
1303  std::string s;
1304  for(int i = 1; i < line->n_allele; i++)
1305  {
1306  s += std::string(line->d.allele[i]) + ",";
1307  }
1308  if(s.length() > 1) s.pop_back();
1309  return s;
1310  }
1311 
1313  inline float QUAL()
1314  {
1315  if(bcf_float_is_missing(line->qual))
1316  {
1317  noneMissing = false;
1318  return bcf_float_missing;
1319  }
1320  else
1321  {
1322  return line->qual;
1323  }
1324  }
1325 
1327  inline std::string FILTER()
1328  {
1329  if(line->d.n_flt == 0)
1330  {
1331  return ".";
1332  }
1333  else if(line->d.n_flt == 1)
1334  {
1335  return std::string(bcf_hdr_int2id(header->hdr, BCF_DT_ID, line->d.flt[0]));
1336  }
1337  else
1338  {
1339  std::string s;
1340  for(int i = 1; i < line->d.n_flt; i++)
1341  {
1342  s += std::string(bcf_hdr_int2id(header->hdr, BCF_DT_ID, line->d.flt[i])) + ",";
1343  }
1344  s.pop_back();
1345  return s;
1346  }
1347  }
1348 
1350  inline std::string allINFO()
1351  {
1352  int32_t max_dt_id = header->hdr->n[BCF_DT_ID];
1353  kstring_t * s = (kstring_t *)calloc(1, sizeof(kstring_t));
1354  if(line->n_info)
1355  {
1356  int first = 1;
1357  for(int i = 0; i < line->n_info; ++i)
1358  {
1359  bcf_info_t * z = &line->d.info[i];
1360  if(!z->vptr) continue;
1361  if(!first) kputc(';', s);
1362  first = 0;
1363  if(z->key < 0 || z->key >= max_dt_id || header->hdr->id[BCF_DT_ID][z->key].key == NULL)
1364  throw std::runtime_error("Invalid BCF and wrong INFO tag");
1365  kputs(header->hdr->id[BCF_DT_ID][z->key].key, s);
1366  if(z->len <= 0) continue;
1367  kputc('=', s);
1368  if(z->len == 1)
1369  {
1370  switch(z->type)
1371  {
1372  case BCF_BT_INT8:
1373  if(z->v1.i == bcf_int8_missing)
1374  kputc('.', s);
1375  else
1376  kputw(z->v1.i, s);
1377  break;
1378  case BCF_BT_INT16:
1379  if(z->v1.i == bcf_int16_missing)
1380  kputc('.', s);
1381  else
1382  kputw(z->v1.i, s);
1383  break;
1384  case BCF_BT_INT32:
1385  if(z->v1.i == bcf_int32_missing)
1386  kputc('.', s);
1387  else
1388  kputw(z->v1.i, s);
1389  break;
1390  case BCF_BT_INT64:
1391  if(z->v1.i == bcf_int64_missing)
1392  kputc('.', s);
1393  else
1394  kputll(z->v1.i, s);
1395  break;
1396  case BCF_BT_FLOAT:
1397  if(bcf_float_is_missing(z->v1.f))
1398  kputc('.', s);
1399  else
1400  kputd(z->v1.f, s);
1401  break;
1402  case BCF_BT_CHAR:
1403  kputc(z->v1.i, s);
1404  break;
1405  default:
1406  throw std::runtime_error("Unexpected type in INFO");
1407  }
1408  }
1409  else
1410  bcf_fmt_array(s, z->len, z->type, z->vptr);
1411  }
1412  if(first) kputc('.', s);
1413  }
1414  else
1415  kputc('.', s);
1416  std::string out = std::string(s->s, s->l);
1417  free(s->s);
1418  free(s);
1419  return out;
1420  }
1421 
1423  inline bool allPhased() const
1424  {
1425  return isAllPhased;
1426  }
1427 
1429  inline int ploidy() const
1430  {
1431  return nploidy;
1432  }
1433 
1435  inline void setPloidy(int v)
1436  {
1437  nploidy = v;
1438  }
1439 
1451  std::vector<char> typeOfGT;
1452 
1454  std::vector<char> gtPhase;
1455 };
1456 
1462 {
1463  private:
1464  std::shared_ptr<htsFile> fp; // hts file
1465  hts_idx_t * hidx = NULL; // hts index file
1466  tbx_t * tidx = NULL; // .tbi .csi index file for vcf files
1467  hts_itr_t * itr = NULL; // tabix iterator
1468  kstring_t s = {0, 0, NULL}; // kstring
1469  std::string fname;
1470  bool isBcf; // if the input file is bcf or vcf;
1471 
1472  public:
1478  std::vector<std::string> SamplesName;
1479 
1482 
1487  BcfReader(const std::string & file) : fname(file)
1488  {
1489  open(file);
1490  }
1491 
1497  BcfReader(const std::string & file, const std::string & region) : fname(file)
1498  {
1499  open(file);
1500  if(!region.empty()) setRegion(region);
1502  }
1503 
1514  BcfReader(const std::string & file, const std::string & region, const std::string & samples) : fname(file)
1515  {
1516  open(file);
1517  if(!region.empty()) setRegion(region);
1518  if(!samples.empty()) setSamples(samples);
1519  }
1520 
1521  ~BcfReader()
1522  {
1523  close();
1524  }
1525 
1527  void close()
1528  {
1529  if(fp) fp.reset();
1530  if(itr) hts_itr_destroy(itr);
1531  if(hidx) hts_idx_destroy(hidx);
1532  if(tidx) tbx_destroy(tidx);
1533  if(s.s) free(s.s);
1534  }
1535 
1537  void open(const std::string & file)
1538  {
1539  fname = file;
1540  fp = std::shared_ptr<htsFile>(hts_open(file.c_str(), "r"), htsFile_close());
1541  if(!fp) throw std::invalid_argument("I/O error: input file is invalid");
1542  header.hdr = bcf_hdr_read(fp.get());
1543  nsamples = bcf_hdr_nsamples(header.hdr);
1545  }
1546 
1548  inline int setThreads(int n)
1549  {
1550  return hts_set_threads(fp.get(), n);
1551  }
1552 
1554  const BcfHeader & getHeader() const
1555  {
1556  return header;
1557  }
1558 
1560  uint64_t getVariantsCount(BcfRecord & r, const std::string & region)
1561  {
1562  uint64_t c{0};
1563  while(getNextVariant(r)) c++;
1564  setRegion(region); // reset the region
1565  return c;
1566  }
1567 
1573  void setSamples(const std::string & samples)
1574  {
1575  header.setSamples(samples);
1576  nsamples = bcf_hdr_nsamples(header.hdr);
1578  }
1579 
1584  void setRegion(const std::string & region)
1585  {
1586  // 1. check and load index first
1587  // 2. query iterval region
1588  // 3. if region is empty, use "."
1589  if(isEndWith(fname, "bcf") || isEndWith(fname, "bcf.gz"))
1590  {
1591  isBcf = true;
1592  hidx = bcf_index_load(fname.c_str());
1593  if(itr) bcf_itr_destroy(itr); // reset current region.
1594  if(region.empty())
1595  itr = bcf_itr_querys(hidx, header.hdr, ".");
1596  else
1597  itr = bcf_itr_querys(hidx, header.hdr, region.c_str());
1598  }
1599  else
1600  {
1601  isBcf = false;
1602  tidx = tbx_index_load(fname.c_str());
1603  assert(tidx != NULL && "error loading tabix index!");
1604  if(itr) tbx_itr_destroy(itr); // reset current region.
1605  if(region.empty())
1606  itr = tbx_itr_querys(tidx, ".");
1607  else
1608  itr = tbx_itr_querys(tidx, region.c_str());
1609  assert(itr != NULL && "no interval region found.failed!");
1610  }
1611  }
1612 
1616  {
1617  int ret = -1;
1618  if(itr != NULL)
1619  {
1620  if(isBcf)
1621  {
1622  ret = bcf_itr_next(fp.get(), itr, r.line.get());
1623  bcf_unpack(r.line.get(), BCF_UN_ALL);
1624  return (ret >= 0);
1625  }
1626  else
1627  {
1628  int slen = tbx_itr_next(fp.get(), tidx, itr, &s);
1629  if(slen > 0)
1630  {
1631  ret = vcf_parse1(&s, r.header->hdr, r.line.get()); // ret > 0, error
1632  bcf_unpack(r.line.get(), BCF_UN_ALL);
1633  }
1634  return (ret <= 0) && (slen > 0);
1635  }
1636  }
1637  else
1638  {
1639  ret = bcf_read(fp.get(), r.header->hdr, r.line.get());
1640  // unpack record immediately. not lazy
1641  bcf_unpack(r.line.get(), BCF_UN_ALL);
1642  return (ret == 0);
1643  }
1644  }
1645 };
1646 
1652 {
1653  private:
1654  std::shared_ptr<htsFile> fp; // hts file
1655  std::shared_ptr<bcf1_t> b = std::shared_ptr<bcf1_t>(bcf_init(), variant_close()); // variant
1656  int ret;
1657  bool isHeaderWritten = false;
1658  const BcfHeader * hp;
1659 
1660  public:
1663 
1666 
1672  BcfWriter(const std::string & fname, std::string version = "VCF4.1")
1673  {
1674  open(fname);
1675  initalHeader(version);
1677  }
1678 
1684  BcfWriter(const std::string & fname, const BcfHeader & h)
1685  {
1686  open(fname);
1687  initalHeader(h);
1688  }
1689 
1700  BcfWriter(const std::string & fname, const std::string & version, const std::string & mode)
1701  {
1702  open(fname, mode);
1703  initalHeader(version);
1705  }
1706 
1717  BcfWriter(const std::string & fname, const BcfHeader & h, const std::string & mode)
1718  {
1719  open(fname, mode);
1720  initalHeader(h);
1721  }
1722 
1723  ~BcfWriter() {}
1724 
1729  void open(const std::string & fname)
1730  {
1731  auto mode = getMode(fname, "w");
1732  fp = std::shared_ptr<htsFile>(hts_open(fname.c_str(), mode.c_str()), htsFile_close());
1733  if(!fp) throw std::invalid_argument("I/O error: input file is invalid");
1734  }
1735 
1745  void open(const std::string & fname, const std::string & mode)
1746  {
1747  fp = std::shared_ptr<htsFile>(hts_open(fname.c_str(), mode.c_str()), htsFile_close());
1748  if(!fp) throw std::invalid_argument("I/O error: input file is invalid");
1749  }
1750 
1752  void close()
1753  {
1754  if(!isHeaderWritten) writeHeader();
1755  if(b) b.reset();
1756  if(fp) fp.reset();
1757  }
1758 
1760  void initalHeader(std::string version = "VCF4.1")
1761  {
1762  header.hdr = bcf_hdr_init("w");
1763  header.setVersion(version);
1764  }
1765 
1767  void initalHeader(const BcfHeader & h)
1768  {
1769  hp = &h;
1770  }
1771 
1773  void copyHeader(const std::string & vcffile, std::string samples = "-")
1774  {
1775  htsFile * fp2 = hts_open(vcffile.c_str(), "r");
1776  if(!fp2) throw std::invalid_argument("I/O error: input file is invalid");
1777  if(samples == "")
1778  { // site-only
1779  bcf_hdr_t * hfull = bcf_hdr_read(fp2);
1780  header.hdr = bcf_hdr_subset(hfull, 0, 0, 0);
1781  bcf_hdr_remove(header.hdr, BCF_HL_FMT, NULL);
1782  bcf_hdr_destroy(hfull);
1783  }
1784  else
1785  {
1786  header.hdr = bcf_hdr_read(fp2);
1787  header.setSamples(samples);
1788  }
1789  hts_close(fp2);
1791  }
1792 
1794  void writeLine(const std::string & vcfline)
1795  {
1796  if(!isHeaderWritten && !writeHeader()) throw std::runtime_error("could not write header\n");
1797  kstring_t s = {0, 0, NULL};
1798  kputsn(vcfline.c_str(), vcfline.length(), &s);
1799  ret = vcf_parse1(&s, hp->hdr, b.get());
1800  free(s.s);
1801  if(ret > 0) throw std::runtime_error("error parsing: " + vcfline + "\n");
1802  if(b->errcode == BCF_ERR_CTG_UNDEF)
1803  {
1804  throw std::runtime_error("contig id " + (std::string)bcf_hdr_id2name(hp->hdr, b->rid)
1805  + " not found in the header. please run header->AddContig() first.\n");
1806  }
1807  ret = bcf_write(fp.get(), hp->hdr, b.get());
1808  if(ret != 0) throw std::runtime_error("error writing: " + vcfline + "\n");
1809  }
1810 
1813  {
1814  ret = bcf_hdr_write(fp.get(), hp->hdr);
1815  if(ret == 0) return isHeaderWritten = true;
1816  return false;
1817  }
1818 
1820  inline bool writeRecord(BcfRecord & v)
1821  {
1822  if(!isHeaderWritten) writeHeader();
1823  if(bcf_write(fp.get(), v.header->hdr, v.line.get()) < 0) return false;
1824  return true;
1825  }
1826 };
1827 
1828 } // namespace vcfpp
1829 
1830 #endif // VCFPP_H_
Object represents header of the VCF, offering methods to access and modify the tags.
Definition: vcfpp.h:184
std::string asString() const
return header as a string
Definition: vcfpp.h:282
void removeFormat(std::string tag) const
remove a FORMAT tag from header
Definition: vcfpp.h:363
void addContig(const std::string &id)
add contig to header
Definition: vcfpp.h:254
void addFILTER(const std::string &id, const std::string &description)
add one FILTER line to header
Definition: vcfpp.h:246
int getFormatType(std::string tag) const
get the type of a given tag
Definition: vcfpp.h:328
void addSample(const std::string &sample) const
add one sample name to header
Definition: vcfpp.h:272
void addFORMAT(const std::string &id, const std::string &number, const std::string &type, const std::string &description)
add FORMAT field to header
Definition: vcfpp.h:232
void setVersion(const std::string &version) const
set the VCF version
Definition: vcfpp.h:390
void removeInfo(std::string tag) const
remove a INFO tag from header
Definition: vcfpp.h:357
void setSamples(const std::string &samples) const
explicitly set samples to be extracted
Definition: vcfpp.h:378
void addLine(const std::string &str)
add one line to header
Definition: vcfpp.h:262
std::vector< std::string > getSeqnames() const
return all contig/chromosome names in a string vector
Definition: vcfpp.h:309
void removeContig(std::string tag) const
remove a contig tag from header
Definition: vcfpp.h:351
std::vector< std::string > getSamples() const
return all samples in a string vector
Definition: vcfpp.h:298
int nSamples() const
return the number of samples in the VCF
Definition: vcfpp.h:396
friend std::ostream & operator<<(std::ostream &out, const BcfHeader &h)
stream out the header
Definition: vcfpp.h:203
void removeFilter(std::string tag) const
remove a FILTER tag from header
Definition: vcfpp.h:369
void addINFO(const std::string &id, const std::string &number, const std::string &type, const std::string &description)
add INFO field to header
Definition: vcfpp.h:217
Stream in variants from compressed/uncompressed VCF/BCF file or stdin.
Definition: vcfpp.h:1462
void setSamples(const std::string &samples)
explicitly stream to specific samples
Definition: vcfpp.h:1573
void open(const std::string &file)
open a VCF/BCF/STDIN file for streaming in
Definition: vcfpp.h:1537
BcfReader(const std::string &file, const std::string &region, const std::string &samples)
construct a vcf/bcf reader with subset samples in target region
Definition: vcfpp.h:1514
BcfHeader header
a BcfHeader object
Definition: vcfpp.h:1474
int setThreads(int n)
set the number of threads to use
Definition: vcfpp.h:1548
uint64_t getVariantsCount(BcfRecord &r, const std::string &region)
get the number of records of given region
Definition: vcfpp.h:1560
BcfReader(const std::string &file, const std::string &region)
construct a vcf/bcf reader with subset samples
Definition: vcfpp.h:1497
bool getNextVariant(BcfRecord &r)
read in the next variant
Definition: vcfpp.h:1615
BcfReader()
Construct an empty BcfReader.
Definition: vcfpp.h:1481
int nsamples
number of samples in the VCF
Definition: vcfpp.h:1476
const BcfHeader & getHeader() const
return a BcfHeader object
Definition: vcfpp.h:1554
BcfReader(const std::string &file)
construct a vcf/bcf reader from file.
Definition: vcfpp.h:1487
void close()
close the BcfReader object.
Definition: vcfpp.h:1527
void setRegion(const std::string &region)
explicitly stream to specific region
Definition: vcfpp.h:1584
std::vector< std::string > SamplesName
number of samples in the VCF
Definition: vcfpp.h:1478
Object represents a variant record in the VCF, offering methods to access and modify fields.
Definition: vcfpp.h:409
float QUAL()
return QUAL value
Definition: vcfpp.h:1313
bool isIndel() const
return boolean value indicates if current variant is exclusively INDEL
Definition: vcfpp.h:1092
bool hasINDEL() const
return boolean value indicates if current variant has INDEL type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1153
void setRefAlt(const std::string &s)
set REF and ALT alleles given a string seperated by comma
Definition: vcfpp.h:1252
isValidGT< T > getGenotypes(T &v)
fill in the input vector with genotypes of 0 and 1. only works for ploidy<=2. Genotypes with missing ...
Definition: vcfpp.h:494
void setPOS(int64_t p)
modify position given 1-based value
Definition: vcfpp.h:1240
bool hasINS() const
return boolean value indicates if current variant has INS type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1162
BcfRecord()
empty constructor. call init() afterwards
Definition: vcfpp.h:432
std::string allINFO()
return raw INFO column as string. recommend to use getINFO for specific tag.
Definition: vcfpp.h:1350
int ploidy() const
return the number of ploidy of current variant
Definition: vcfpp.h:1429
void removeINFO(std::string tag)
remove the given tag from INFO of the variant
Definition: vcfpp.h:881
BcfRecord(BcfHeader &h)
constructor with a given BcfHeader object
Definition: vcfpp.h:435
std::string CHROM() const
return CHROM name
Definition: vcfpp.h:1216
void setID(const std::string &s)
update ID
Definition: vcfpp.h:1246
isValidFMT< T > setFORMAT(std::string tag, const T &v)
set tag values for all samples in FORMAT using given vector
Definition: vcfpp.h:987
int64_t POS() const
return 1-base position
Definition: vcfpp.h:1228
std::string ID() const
return ID field
Definition: vcfpp.h:1222
isScalar< T > setFORMAT(std::string tag, const T &v)
set tag for a single sample in FORMAT using given singular value. this works only when there is one s...
Definition: vcfpp.h:1025
bool isSV() const
return boolean value indicates if current variant is Structual Variant or not
Definition: vcfpp.h:1079
void setQUAL(float q)
modify the QUAL value
Definition: vcfpp.h:1258
bool hasSNP() const
return boolean value indicates if current variant has SNP type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1144
std::vector< char > isGenoMissing
if there is "." in GT for the sample, then it's coded as missing (TRUE)
Definition: vcfpp.h:428
std::string FILTER()
return raw FILTER column as string
Definition: vcfpp.h:1327
void swap_REF_ALT()
swap REF and ALT for biallelic SNP
Definition: vcfpp.h:1295
std::string REF() const
return raw REF alleles as string
Definition: vcfpp.h:1289
std::string asString() const
return current variant as raw string
Definition: vcfpp.h:470
int64_t Start() const
return 0-base start of the variant (can be any type)
Definition: vcfpp.h:1277
bool hasBND() const
return boolean value indicates if current variant has BND type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1189
friend std::ostream & operator<<(std::ostream &out, const BcfRecord &v)
stream out the variant
Definition: vcfpp.h:463
isScalar< T > setINFO(std::string tag, const T &v)
set tag value for INFO
Definition: vcfpp.h:814
void resetHeader(BcfHeader &h)
reset the header associated with BcfRecord object by pointing to another BcfHeader object
Definition: vcfpp.h:457
bool hasMNP() const
return boolean value indicates if current variant has MNP type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1180
bool allPhased() const
return boolean value indicates if genotypes of all samples are phased
Definition: vcfpp.h:1423
bool hasDEL() const
return boolean value indicates if current variant has DEL type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1171
bool isMultiAllelicSNP() const
return boolean value indicates if current variant is exclusively multiallelic SNP sites
Definition: vcfpp.h:1113
void setPhasing(const std::vector< char > &v)
set phasing status for all diploid samples using given vector
Definition: vcfpp.h:953
bool setGenotypes(const std::vector< int > &v)
set genotypes from scratch even if genotypes not present
Definition: vcfpp.h:912
std::string ALT() const
return raw ALT alleles as string
Definition: vcfpp.h:1301
bool hasOTHER() const
return boolean value indicates if current variant has OTHER type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1198
isString< T > getINFO(std::string tag, T &v)
get tag value in INFO
Definition: vcfpp.h:789
bool isSNP() const
return boolean value indicates if current variant is exclusively biallelic SNP. Note ALT=* are skippe...
Definition: vcfpp.h:1130
bool isMultiAllelics() const
return boolean value indicates if current variant is multiallelic sites
Definition: vcfpp.h:1106
void setCHR(const std::string &s)
modify CHROM value
Definition: vcfpp.h:1234
int64_t End() const
return 0-base end of the variant (can be any type)
Definition: vcfpp.h:1283
void setPloidy(int v)
in a rare case, one may want to set the number of ploidy manually
Definition: vcfpp.h:1435
isFormatVector< T > getFORMAT(std::string tag, T &v)
get tag value in FORMAT
Definition: vcfpp.h:626
void initHeader(BcfHeader &h)
initilize the header associated with BcfRecord object by pointing to another BcfHeader object
Definition: vcfpp.h:447
void addLineFromString(const std::string &vcfline)
add one variant record from given string
Definition: vcfpp.h:1052
bool getFORMAT(std::string tag, std::vector< std::string > &v)
get tag value in FORMAT
Definition: vcfpp.h:674
bool isNoneMissing() const
if all samples have non missing values for the tag in FORMAT
Definition: vcfpp.h:1073
void setQUAL(char q)
modify the QUAL value
Definition: vcfpp.h:1264
void setFILTER(const std::string &s)
modify the FILTER value
Definition: vcfpp.h:1270
isValidInfo< T > setINFO(std::string tag, const T &v)
set tag value for INFO
Definition: vcfpp.h:849
std::vector< char > gtPhase
vector of nsamples length. keep track of the phasing status of each sample
Definition: vcfpp.h:1454
void removeFORMAT(std::string tag)
remove the given tag from FORMAT of the variant
Definition: vcfpp.h:960
std::vector< char > typeOfGT
vector of nsamples length. keep track of the type of genotype (one of GT_HOM_RR, GT_HET_RA,...
Definition: vcfpp.h:1451
isInfoVector< T > getINFO(std::string tag, T &v)
get tag value in INFO
Definition: vcfpp.h:714
bool getGenotypes(std::vector< int > &v)
fill in the input vector with genotype values, 0, 1 or -9 (missing).
Definition: vcfpp.h:567
bool hasOVERLAP() const
return boolean value indicates if current variant has OVERLAP type defined in vcf....
Definition: vcfpp.h:1207
isScalar< T > getINFO(std::string tag, T &v)
get tag value in INFO
Definition: vcfpp.h:752
Stream out variants to compressed/uncompressed VCF/BCF file or stdout.
Definition: vcfpp.h:1652
void close()
close the BcfWriter object.
Definition: vcfpp.h:1752
void writeLine(const std::string &vcfline)
copy a string to a vcf line
Definition: vcfpp.h:1794
BcfWriter(const std::string &fname, const std::string &version, const std::string &mode)
Open VCF/BCF file for writing using given mode.
Definition: vcfpp.h:1700
void copyHeader(const std::string &vcffile, std::string samples="-")
copy header of given VCF and restrict on samples. if samples=="", FORMAT removed and only sites left
Definition: vcfpp.h:1773
BcfWriter(const std::string &fname, std::string version="VCF4.1")
Open VCF/BCF file for writing. The format is infered from file's suffix.
Definition: vcfpp.h:1672
void open(const std::string &fname)
Open VCF/BCF file for writing. The format is infered from file's suffix.
Definition: vcfpp.h:1729
BcfWriter(const std::string &fname, const BcfHeader &h)
Open VCF/BCF file for writing. The format is infered from file's suffix.
Definition: vcfpp.h:1684
void initalHeader(const BcfHeader &h)
initial a VCF header by pointing to header of another VCF
Definition: vcfpp.h:1767
BcfWriter(const std::string &fname, const BcfHeader &h, const std::string &mode)
Open VCF/BCF file for writing using given mode.
Definition: vcfpp.h:1717
void initalHeader(std::string version="VCF4.1")
initial a VCF header using the internal template given a specific version. VCF4.1 is the default
Definition: vcfpp.h:1760
BcfHeader header
header object initialized by initalHeader
Definition: vcfpp.h:1662
BcfWriter()
Construct an empty BcfWriter.
Definition: vcfpp.h:1665
void open(const std::string &fname, const std::string &mode)
Open VCF/BCF file for writing using given mode.
Definition: vcfpp.h:1745
bool writeRecord(BcfRecord &v)
streams out the given variant of BcfRecord type
Definition: vcfpp.h:1820
bool writeHeader()
streams out the header
Definition: vcfpp.h:1812
Definition: vcfpp.h:171
Definition: vcfpp.h:153
Definition: vcfpp.h:162