vcfpp
vcfpp.h
1 /*******************************************************************************
2  * @file https://github.com/Zilong-Li/vcfpp/vcfpp.h
3  * @author Zilong Li
4  * @email zilong.dk@gmail.com
5  * @version v0.4.0
6  * @breif a single C++ file for manipulating VCF
7  * Copyright (C) 2022-2023.The use of this code is governed by the LICENSE file.
8  ******************************************************************************/
9 
36 #pragma once
37 
38 #ifndef VCFPP_H_
39 # define VCFPP_H_
40 
41 # include <iostream>
42 # include <memory>
43 # include <string>
44 # include <type_traits>
45 # include <vector>
46 
47 // make sure you have htslib installed
48 extern "C"
49 {
50 # include <htslib/kstring.h>
51 # include <htslib/tbx.h>
52 # include <htslib/vcf.h>
53 # include <htslib/vcfutils.h>
54 }
55 
56 namespace vcfpp
57 {
58 template<typename T>
59 using isValidFMT =
60  typename std::enable_if<std::is_same<T, std::string>::value || std::is_same<T, std::vector<char>>::value
61  || std::is_same<T, std::vector<int>>::value
62  || std::is_same<T, std::vector<float>>::value,
63  bool>::type;
64 
65 template<typename T>
66 using isValidInfo =
67  typename std::enable_if<std::is_same<T, std::string>::value || std::is_same<T, std::vector<int>>::value
68  || std::is_same<T, std::vector<float>>::value,
69  bool>::type;
70 
71 template<typename T>
72 using isInfoVector = typename std::enable_if<std::is_same<T, std::vector<int>>::value
73  || std::is_same<T, std::vector<float>>::value,
74  bool>::type;
75 
76 template<typename T>
77 using isScalar = typename std::enable_if<std::is_same<T, int>::value || std::is_same<T, float>::value
78  || std::is_same<T, double>::value,
79  bool>::type;
80 
81 template<typename T>
82 using isString = typename std::enable_if<std::is_same<T, std::string>::value, bool>::type;
83 
84 template<typename T>
85 using isValidGT = typename std::enable_if<std::is_same<T, std::vector<bool>>::value
86  || std::is_same<T, std::vector<char>>::value,
87  bool>::type;
88 
89 template<typename T>
90 using isFormatVector = typename std::enable_if<std::is_same<T, std::vector<float>>::value
91  || std::is_same<T, std::vector<char>>::value
92  || std::is_same<T, std::vector<int>>::value,
93  bool>::type;
94 
95 template<typename T>
96 isScalar<T> isMissing(T const & v)
97 {
98  if(std::is_same<T, float>::value)
99  {
100  return bcf_float_is_missing(v);
101  }
102  else if(std::is_same<T, int>::value)
103  {
104  return bcf_int32_missing(v);
105  }
106 }
107 
108 // test if a string is end with another string
109 inline bool isEndWith(std::string const & s, std::string const & e)
110 {
111  if(s.length() >= e.length())
112  {
113  return (0 == s.compare(s.length() - e.length(), e.length(), e));
114  }
115  else
116  {
117  return false;
118  }
119 }
120 
121 // determinate the mode for read/write the compressed/uncompressed VCF/BCF
122 inline std::string getMode(std::string const & fname, std::string mode = "r")
123 {
124  if(isEndWith(fname, "bcf.gz")) mode += "b";
125  if(isEndWith(fname, "bcf")) mode += "bu";
126  if(isEndWith(fname, "vcf.gz")) mode += "z";
127  return mode;
128 }
129 
130 // string split by separator
131 inline std::vector<std::string> split_string(const std::string & s, const std::string & separators)
132 {
133  std::vector<std::string> ret;
134  bool is_seperator[256] = {false};
135  for(auto & ch : separators)
136  {
137  is_seperator[(unsigned int)ch] = true;
138  }
139  int begin = 0;
140  for(int i = 0; i <= (int)s.size(); i++)
141  {
142  if(is_seperator[(uint8_t)s[i]] || i == (int)s.size())
143  {
144  ret.push_back(std::string(s.begin() + begin, s.begin() + i));
145  begin = i + 1;
146  }
147  }
148  return ret;
149 }
150 
151 // deleter for htsFile
153 {
154  void operator()(htsFile * x)
155  {
156  if(x) hts_close(x);
157  }
158 };
159 
160 // deleter for variant
162 {
163  void operator()(bcf1_t * x)
164  {
165  if(x) bcf_destroy(x);
166  }
167 };
168 
169 // deleter for bcf header
171 {
172  void operator()(bcf_hdr_t * x)
173  {
174  if(x) bcf_hdr_destroy(x);
175  }
176 };
177 
184 {
185  friend class BcfRecord;
186  friend class BcfReader;
187  friend class BcfWriter;
188 
189  private:
190  bcf_hdr_t * hdr = NULL; // bcf header
191  bcf_hrec_t * hrec = NULL; // populate header
192 
193  public:
194  BcfHeader() {}
195 
196  ~BcfHeader()
197  {
198  if(hrec) bcf_hrec_destroy(hrec);
199  if(hdr) bcf_hdr_destroy(hdr);
200  }
201 
203  friend std::ostream & operator<<(std::ostream & out, const BcfHeader & h)
204  {
205  out << h.asString();
206  return out;
207  }
208 
209  // TODO: check if the value is valid for vcf specification
210 
217  inline void addINFO(const std::string & id,
218  const std::string & number,
219  const std::string & type,
220  const std::string & description)
221  {
222  addLine("##INFO=<ID=" + id + ",Number=" + number + ",Type=" + type + ",Description=\"" + description
223  + "\">");
224  }
225 
232  inline void addFORMAT(const std::string & id,
233  const std::string & number,
234  const std::string & type,
235  const std::string & description)
236  {
237  addLine("##FORMAT=<ID=" + id + ",Number=" + number + ",Type=" + type + ",Description=\"" + description
238  + "\">");
239  }
240 
246  inline void addFILTER(const std::string & id, const std::string & description)
247  {
248  addLine("##FILTER=<ID=" + id + ",Description=\"" + description + "\">");
249  }
250 
254  inline void addContig(const std::string & id)
255  {
256  addLine("##contig=<ID=" + id + ">");
257  }
258 
262  inline void addLine(const std::string & str)
263  {
264  int ret = 0;
265  ret = bcf_hdr_append(hdr, str.c_str());
266  if(ret != 0) throw std::runtime_error("could not add " + str + " to header\n");
267  ret = bcf_hdr_sync(hdr);
268  if(ret != 0) throw std::runtime_error("could not add " + str + " to header\n");
269  }
270 
272  inline void addSample(const std::string & sample) const
273  {
274  bcf_hdr_add_sample(hdr, sample.c_str());
275  if(bcf_hdr_sync(hdr) != 0)
276  {
277  throw std::runtime_error("couldn't add the sample.\n");
278  }
279  }
280 
282  inline std::string asString() const
283  {
284  kstring_t s = {0, 0, NULL}; // kstring
285  if(bcf_hdr_format(hdr, 0, &s) == 0) // append header string to s.s! append!
286  {
287  std::string out(s.s, s.l);
288  free(s.s);
289  return out;
290  }
291  else
292  {
293  throw std::runtime_error("failed to convert formatted header to string");
294  }
295  }
296 
298  std::vector<std::string> getSamples() const
299  {
300  std::vector<std::string> vec;
301  for(int i = 0; i < bcf_hdr_nsamples(hdr); i++)
302  {
303  vec.push_back(std::string(hdr->samples[i]));
304  }
305  return vec;
306  }
307 
309  std::vector<std::string> getSeqnames() const
310  {
311  int ret = 0;
312  const char ** seqs = bcf_hdr_seqnames(hdr, &ret);
313  if(ret == 0) printf("there is no contig id in the header!\n");
314  std::vector<std::string> vec;
315  for(int i = 0; i < ret; i++)
316  {
317  vec.push_back(std::string(seqs[i]));
318  }
319  free(seqs);
320  return vec;
321  }
322 
328  inline int getFormatType(std::string tag) const
329  {
330  int tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag.c_str());
331  if(tag_id < 0) return 0;
332  if(bcf_hdr_id2type(hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
333  {
334  return 1;
335  }
336  else if(bcf_hdr_id2type(hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
337  {
338  return 2;
339  }
340  else if(bcf_hdr_id2type(hdr, BCF_HL_FMT, tag_id) == (BCF_HT_STR & 0xff))
341  {
342  return 3;
343  }
344  else
345  {
346  return 0;
347  }
348  }
349 
351  inline void removeContig(std::string tag) const
352  {
353  bcf_hdr_remove(hdr, BCF_HL_CTG, tag.c_str());
354  }
355 
357  inline void removeInfo(std::string tag) const
358  {
359  bcf_hdr_remove(hdr, BCF_HL_INFO, tag.c_str());
360  }
361 
363  inline void removeFormat(std::string tag) const
364  {
365  bcf_hdr_remove(hdr, BCF_HL_FMT, tag.c_str());
366  }
367 
369  inline void removeFilter(std::string tag) const
370  {
371  bcf_hdr_remove(hdr, BCF_HL_FLT, tag.c_str());
372  }
373 
378  inline void setSamples(const std::string & samples) const
379  {
380  int ret = 0;
381  ret = bcf_hdr_set_samples(hdr, samples.c_str(), 0);
382  if(ret != 0)
383  {
384  throw std::runtime_error("the " + std::to_string(ret)
385  + "-th sample are not in the VCF.\nparameter samples:" + samples);
386  }
387  }
388 
390  inline void setVersion(const std::string & version) const
391  {
392  bcf_hdr_set_version(hdr, version.c_str());
393  }
394 
396  inline int nSamples() const
397  {
398  return bcf_hdr_nsamples(hdr);
399  }
400 };
401 
409 {
410  friend class BcfReader;
411  friend class BcfWriter;
412 
413  private:
414  BcfHeader * header;
415  std::shared_ptr<bcf1_t> line = std::shared_ptr<bcf1_t>(bcf_init(), variant_close()); // variant
416  bcf_hdr_t * hdr_d = NULL; // a dup header by bcf_hdr_dup(header->hdr)
417  bcf_fmt_t * fmt = NULL;
418  bcf_info_t * info = NULL;
419  int32_t * gts = NULL;
420  int ndst, ret, nsamples;
421  bool noneMissing = true; // whenever parsing a tag have to reset this variable
422  bool isAllPhased = false;
423  int nploidy = 0; // the number of ploidy
424  int nvalues = 0; // the number of values for a tag in FORMAT
425 
426  public:
428  std::vector<char> isGenoMissing;
429 
430  public:
433 
436  {
437  initHeader(h);
438  }
439 
440  ~BcfRecord()
441  {
442  if(gts) free(gts);
443  if(hdr_d) bcf_hdr_destroy(hdr_d);
444  }
445 
448  {
449  header = &h;
450  if(!header->hdr) throw std::runtime_error("please initial header first\n");
451  nsamples = header->nSamples();
452  typeOfGT.resize(nsamples);
453  gtPhase.resize(nsamples, 0);
454  }
455 
458  {
459  header = &h;
460  }
461 
463  friend std::ostream & operator<<(std::ostream & out, const BcfRecord & v)
464  {
465  out << v.asString();
466  return out;
467  }
468 
470  inline std::string asString() const
471  {
472  kstring_t s = {0, 0, NULL}; // kstring
473  if(vcf_format(header->hdr, line.get(), &s) == 0)
474  {
475  std::string out(s.s, s.l);
476  free(s.s);
477  return out;
478  }
479  else
480  {
481  throw std::runtime_error("couldn't format current record into a string.\n");
482  }
483  }
484 
493  template<typename T>
494  isValidGT<T> getGenotypes(T & v)
495  {
496  ndst = 0;
497  ret = bcf_get_genotypes(header->hdr, line.get(), &gts, &ndst);
498  if(ret <= 0)
499  {
500 # if defined(VERBOSE)
501  std::cerr << "GT not present for current site. did you initilize the variant object?\n";
502 # endif
503  return false;
504  }
505  // if nploidy is not set manually. find the max nploidy using the first variant (eg. 2) resize v as
506  // max(nploidy)
507  // * nsamples (ret) NOTE: if ret == nsamples and only one sample then haploid
508  if(ret != nploidy * nsamples && nploidy > 0)
509  {
510  // rare case if nploidy is set manually. eg. only one sample. the first variant is '1' but the
511  // second is '1/0'. ret = 1 but nploidy should be 2
512  v.resize(nploidy * nsamples);
513  }
514  else
515  {
516  v.resize(ret);
517  nploidy = ret / nsamples;
518  }
519  // work with nploidy == 1, haploid?
520  isGenoMissing.assign(nsamples, 0);
521  int i, j, nphased = 0;
522  noneMissing = true;
523  fmt = bcf_get_fmt(header->hdr, line.get(), "GT");
524  int nploidy_cur = ret / nsamples; // requires nploidy_cur <= nploidy
525  for(i = 0; i < nsamples; i++)
526  {
527  // check and fill in typeOfGT; only supports SNPs now. check vcfstats.c for inspiration
528  typeOfGT[i] = bcf_gt_type(fmt, i, NULL, NULL);
529  if(typeOfGT[i] == GT_UNKN)
530  {
531  noneMissing = false; // set missing as het, user should check if isNoneMissing();
532  isGenoMissing[i] = 1;
533  v[i * nploidy + 0] = 1;
534  for(j = 1; j < nploidy_cur; j++) v[i * nploidy + j] = 0;
535  continue;
536  }
537 
538  for(j = 0; j < nploidy_cur; j++)
539  {
540  // TODO: right now only parse 0 and 1, ie max(nploidy)=2; other values 2,3... will be set to 1
541  v[i * nploidy + j] = bcf_gt_allele(gts[j + i * nploidy_cur]) != 0;
542  }
543  if(nploidy == 2)
544  {
545  gtPhase[i] = (gts[1 + i * nploidy_cur] & 1) == 1;
546  nphased += gtPhase[i];
547  }
548  }
549  if(nphased == nsamples)
550  {
551  isAllPhased = true;
552  }
553  else
554  {
555  isAllPhased = false;
556  }
557  return true;
558  }
559 
567  bool getGenotypes(std::vector<int> & v)
568  {
569  ndst = 0;
570  ret = bcf_get_genotypes(header->hdr, line.get(), &gts, &ndst);
571  if(ret <= 0)
572  {
573 # if defined(VERBOSE)
574  std::cerr << "GT not present for current site. did you initilize the variant object?\n";
575 # endif
576  return false;
577  }
578  v.resize(ret);
579  isGenoMissing.assign(nsamples, 0);
580  nploidy = ret / nsamples;
581  int i, j, nphased = 0;
582  noneMissing = true;
583  for(i = 0; i < nsamples; i++)
584  {
585  int32_t * ptr = gts + i * nploidy;
586  int is_phased = 0;
587  for(j = 0; j < nploidy; j++)
588  {
589  // if true, the sample has smaller ploidy
590  if(ptr[j] == bcf_int32_vector_end) break;
591  // missing allele
592  if(bcf_gt_is_missing(ptr[j]))
593  {
594  noneMissing = false;
595  isGenoMissing[i] = 1;
596  v[i * nploidy + j] = -9;
597  continue;
598  }
599  v[i * nploidy + j] = bcf_gt_allele(ptr[j]);
600  is_phased += bcf_gt_is_phased(ptr[j]);
601  }
602  if(nploidy == is_phased)
603  {
604  gtPhase[i] = true;
605  nphased += gtPhase[i];
606  }
607  }
608  if(nphased == nsamples)
609  {
610  isAllPhased = true;
611  }
612  else
613  {
614  isAllPhased = false;
615  }
616  return true;
617  }
618 
625  template<typename T, typename S = typename T::value_type>
626  isFormatVector<T> getFORMAT(std::string tag, T & v)
627  {
628  fmt = bcf_get_fmt(header->hdr, line.get(), tag.c_str());
629  if(!fmt)
630  {
631  throw std::invalid_argument("no FORMAT=" + tag + " in the VCF header.\n");
632  }
633  nvalues = fmt->n;
634  ndst = 0;
635  S * dst = NULL;
636  int tagid = header->getFormatType(tag);
637  if(tagid == 1)
638  {
639  ret = bcf_get_format_int32(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
640  }
641  else if(tagid == 2)
642  {
643  ret = bcf_get_format_float(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
644  }
645  else if(tagid == 3)
646  {
647  ret = bcf_get_format_char(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
648  }
649  else
650  {
651  ret = -1;
652  }
653 
654  if(ret >= 0)
655  {
656  // user have to check if there is missing in the return v;
657  v = std::vector<S>(dst, dst + ret);
658  free(dst);
659  return true;
660  }
661  else
662  {
663  free(dst);
664  return false;
665  }
666  }
667 
674  bool getFORMAT(std::string tag, std::vector<std::string> & v)
675  {
676  fmt = bcf_get_fmt(header->hdr, line.get(), tag.c_str());
677  if(!fmt)
678  {
679  throw std::invalid_argument("no FORMAT=" + tag + " in the VCF header.\n");
680  }
681  nvalues = fmt->n;
682  // if ndst < (fmt->n+1)*nsmpl; then realloc is involved
683  ret = -1, ndst = 0;
684  char ** dst = NULL;
685  if(header->getFormatType(tag) == 3)
686  ret = bcf_get_format_string(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
687  if(ret > 0)
688  {
689  v.clear();
690  for(int i = 0; i < nsamples; i++)
691  {
692  // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT.
693  v.emplace_back(dst[i]);
694  };
695  free(dst[0]);
696  free(dst);
697  return true;
698  }
699  else
700  {
701  free(dst[0]);
702  free(dst);
703  return false;
704  }
705  }
706 
713  template<typename T, typename S = typename T::value_type>
714  isInfoVector<T> getINFO(std::string tag, T & v)
715  {
716  info = bcf_get_info(header->hdr, line.get(), tag.c_str());
717  if(!info)
718  {
719  throw std::invalid_argument("no INFO=" + tag + " in the VCF header.\n");
720  }
721  S * dst = NULL;
722  ndst = 0;
723  ret = -1;
724  if(info->type == BCF_BT_INT8 || info->type == BCF_BT_INT16 || info->type == BCF_BT_INT32)
725  {
726  ret = bcf_get_info_int32(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
727  }
728  else if(info->type == BCF_BT_FLOAT)
729  {
730  ret = bcf_get_info_float(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
731  }
732  if(ret >= 0)
733  {
734  v = std::vector<S>(dst, dst + ret); // user have to check if there is missing in the return v;
735  free(dst);
736  return true;
737  }
738  else
739  {
740  free(dst);
741  return false;
742  }
743  }
744 
751  template<typename T>
752  isScalar<T> getINFO(std::string tag, T & v)
753  {
754  info = bcf_get_info(header->hdr, line.get(), tag.c_str());
755  if(!info)
756  {
757  throw std::invalid_argument("no INFO=" + tag + " in the VCF header.\n");
758  }
759  // scalar value
760  if(info->len == 1)
761  {
762  if(info->type == BCF_BT_INT8 || info->type == BCF_BT_INT16 || info->type == BCF_BT_INT32)
763  {
764  v = info->v1.i;
765  }
766  else if(info->type == BCF_BT_FLOAT)
767  {
768  v = info->v1.f;
769  }
770  return true;
771  }
772  else
773  {
774 # if defined(VERBOSE)
775  std::cerr << "there are multiple values for " + tag
776  + " in INFO for current site. please use vector instead\n";
777 # endif
778  return false;
779  }
780  }
781 
788  template<typename T>
789  isString<T> getINFO(std::string tag, T & v)
790  {
791  info = bcf_get_info(header->hdr, line.get(), tag.c_str());
792  if(!info)
793  {
794  throw std::invalid_argument("no INFO=" + tag + " in the VCF header.\n");
795  }
796  if(info->type == BCF_BT_CHAR)
797  {
798  v = std::string(reinterpret_cast<char *>(info->vptr), info->vptr_len);
799  return true;
800  }
801  else
802  {
803  return false;
804  }
805  }
806 
813  template<typename T>
814  isScalar<T> setINFO(std::string tag, const T & v)
815  {
816  // bcf_hrec_set_val
817  // bcf_update_info_flag(header.hdr, line, tag.c_str(), NULL, 1);
818  int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
819  if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff))
820  {
821  ret = bcf_update_info_int32(header->hdr, line.get(), tag.c_str(), &v, 1);
822  }
823  else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff))
824  {
825  float v2 = static_cast<float>(v);
826  ret = bcf_update_info_float(header->hdr, line.get(), tag.c_str(), &v2, 1);
827  }
828  else
829  {
830  ret = -1;
831  }
832  if(ret < 0)
833  {
834 # if defined(VERBOSE)
835  std::cerr << "couldn't set " + tag + " for this variant.\nplease add the tag in headerfirst.\n";
836 # endif
837  return false;
838  }
839  return true;
840  }
841 
848  template<typename T>
849  isValidInfo<T> setINFO(std::string tag, const T & v)
850  {
851  // bcf_update_info_flag(header.hdr, line, tag.c_str(), NULL, 1);
852  int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
853  if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff))
854  {
855  ret = bcf_update_info_int32(header->hdr, line.get(), tag.c_str(), v.data(), v.size());
856  }
857  else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff))
858  {
859  ret = bcf_update_info_float(header->hdr, line.get(), tag.c_str(), v.data(), v.size());
860  }
861  else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_STR & 0xff))
862  {
863  ret = bcf_update_info_string(header->hdr, line.get(), tag.c_str(), v.data());
864  }
865  else
866  {
867  ret = -1;
868  }
869 
870  if(ret < 0)
871  {
872 # if defined(VERBOSE)
873  std::cerr << "couldn't set " + tag + " for this variant.\nplease add the tag in headerfirst.\n";
874 # endif
875  return false;
876  }
877  return true;
878  }
879 
881  void removeINFO(std::string tag)
882  {
883  int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
884  if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff))
885  {
886  ret = bcf_update_info_int32(header->hdr, line.get(), tag.c_str(), NULL, 0);
887  }
888  else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff))
889  {
890  ret = bcf_update_info_float(header->hdr, line.get(), tag.c_str(), NULL, 0);
891  }
892  else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_STR & 0xff))
893  {
894  ret = bcf_update_info_string(header->hdr, line.get(), tag.c_str(), NULL);
895  }
896  else
897  {
898  ret = -1;
899  }
900 
901  if(ret < 0)
902  {
903  throw std::runtime_error("couldn't remove " + tag + " for this variant.\n");
904  }
905  }
906 
912  bool setGenotypes(const std::vector<int> & v)
913  {
914  // bcf_gt_type
915  int i, j, k;
916  nploidy = v.size() / nsamples;
917  int32_t * gt = (int32_t *)malloc(v.size() * sizeof(int32_t));
918  for(i = 0; i < nsamples; i++)
919  {
920  for(j = 0; j < nploidy; j++)
921  {
922  k = i * nploidy + j;
923  if(v[k] == -9 || v[k] == bcf_int32_missing)
924  {
925  gt[k] = bcf_gt_missing;
926  }
927  else if(gtPhase[i])
928  {
929  gt[k] = bcf_gt_phased(v[k]);
930  }
931  else
932  {
933  gt[k] = bcf_gt_unphased(v[k]);
934  }
935  }
936  }
937  if(bcf_update_genotypes(header->hdr, line.get(), gt, v.size()) < 0)
938  {
939  free(gt);
940 # if defined(VERBOSE)
941  std::cerr << "couldn't set genotypes correctly.\n";
942 # endif
943  return false;
944  }
945  free(gt);
946  return true;
947  }
948 
953  void setPhasing(const std::vector<char> & v)
954  {
955  assert((int)v.size() == nsamples);
956  gtPhase = v;
957  }
958 
960  void removeFORMAT(std::string tag)
961  {
962  ret = -1;
963  int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
964  if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
965  {
966  ret = bcf_update_format_int32(header->hdr, line.get(), tag.c_str(), NULL, 0);
967  }
968  else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_STR & 0xff))
969  {
970  ret = bcf_update_format_char(header->hdr, line.get(), tag.c_str(), NULL, 0);
971  }
972  else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
973  {
974  ret = bcf_update_format_float(header->hdr, line.get(), tag.c_str(), NULL, 0);
975  }
976 
977  if(ret < 0) throw std::runtime_error("couldn't remove " + tag + " correctly.\n");
978  }
979 
986  template<typename T>
987  isValidFMT<T> setFORMAT(std::string tag, const T & v)
988  {
989  int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
990  if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
991  {
992  ret = bcf_update_format_int32(header->hdr, line.get(), tag.c_str(), v.data(), v.size());
993  }
994  else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_STR & 0xff))
995  {
996  ret = bcf_update_format_char(header->hdr, line.get(), tag.c_str(), v.data(), v.size());
997  }
998  else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
999  {
1000  ret = bcf_update_format_float(header->hdr, line.get(), tag.c_str(), v.data(), v.size());
1001  }
1002  else
1003  {
1004  ret = -1;
1005  }
1006 
1007  if(ret < 0)
1008  {
1009 # if defined(VERBOSE)
1010  std::cerr << "couldn't set format " + tag + " corectly.\n";
1011 # endif
1012  return false;
1013  }
1014  return true;
1015  }
1016 
1024  template<typename T>
1025  isScalar<T> setFORMAT(std::string tag, const T & v)
1026  {
1027  float v2 = v;
1028  int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
1029  if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
1030  {
1031  ret = bcf_update_format_int32(header->hdr, line.get(), tag.c_str(), &v, 1);
1032  }
1033  else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
1034  {
1035  ret = bcf_update_format_float(header->hdr, line.get(), tag.c_str(), &v2, 1);
1036  }
1037  else
1038  {
1039  ret = -1;
1040  }
1041  if(ret < 0)
1042  {
1043 # if defined(VERBOSE)
1044  std::cerr << "couldn't set format " + tag + " corectly.\n";
1045 # endif
1046  return false;
1047  }
1048  return true;
1049  }
1050 
1052  void addLineFromString(const std::string & vcfline)
1053  {
1054  kstring_t s = {0, 0, NULL};
1055  kputsn(vcfline.c_str(), vcfline.length(), &s);
1056  ret = vcf_parse1(&s, header->hdr, line.get());
1057  free(s.s);
1058  if(ret > 0) throw std::runtime_error("error parsing: " + vcfline + "\n");
1059  if(line->errcode == BCF_ERR_CTG_UNDEF)
1060  {
1061  std::string contig(bcf_hdr_id2name(header->hdr, line->rid));
1062  hdr_d = bcf_hdr_dup(header->hdr);
1063  header->hrec = bcf_hdr_id2hrec(hdr_d, BCF_DT_CTG, 0, line->rid);
1064  if(header->hrec == NULL)
1065  throw std::runtime_error("contig" + contig + " unknow and not found in the header.\n");
1066  ret = bcf_hdr_add_hrec(header->hdr, header->hrec);
1067  if(ret < 0) throw std::runtime_error("error adding contig " + contig + " to header.\n");
1068  ret = bcf_hdr_sync(header->hdr);
1069  }
1070  }
1071 
1073  inline bool isNoneMissing() const
1074  {
1075  return noneMissing;
1076  }
1077 
1079  inline bool isSV() const
1080  {
1081  if(bcf_get_info(header->hdr, line.get(), "SVTYPE") == NULL)
1082  {
1083  return false;
1084  }
1085  else
1086  {
1087  return true;
1088  }
1089  }
1090 
1092  inline bool isIndel() const
1093  {
1094  // REF has multiple allels
1095  if(REF().length() > 1 && !isSV()) return true;
1096  for(int i = 1; i < line->n_allele; i++)
1097  {
1098  std::string alt(line->d.allele[i]);
1099  if(alt == ".") return true;
1100  if(alt.length() != REF().length() && !isSV()) return true;
1101  }
1102  return false;
1103  }
1104 
1106  inline bool isMultiAllelics() const
1107  {
1108  if(line->n_allele <= 2) return false;
1109  return true;
1110  }
1111 
1113  inline bool isMultiAllelicSNP() const
1114  {
1115  // skip REF with length > 1, i.e. INDEL
1116  if(REF().length() > 1 || line->n_allele <= 2) return false;
1117  for(int i = 1; i < line->n_allele; i++)
1118  {
1119  std::string snp(line->d.allele[i]);
1120  if(snp.length() != 1)
1121  {
1122  return false;
1123  }
1124  }
1125  return true;
1126  }
1127 
1130  inline bool isSNP() const
1131  {
1132  // REF and ALT have multiple allels
1133  if(REF().length() > 1 || line->n_allele > 2) return false;
1134  std::string snp(line->d.allele[1]);
1135  if(!(snp == "A" || snp == "C" || snp == "G" || snp == "T"))
1136  {
1137  return false;
1138  }
1139  return true;
1140  }
1141 
1144  inline bool hasSNP() const
1145  {
1146  int type = bcf_has_variant_types(line.get(), VCF_SNP, bcf_match_overlap);
1147  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1148  if(type == 0) return false;
1149  return true;
1150  }
1151 
1153  inline bool hasINDEL() const
1154  {
1155  int type = bcf_has_variant_types(line.get(), VCF_INDEL, bcf_match_overlap);
1156  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1157  if(type == 0) return false;
1158  return true;
1159  }
1160 
1162  inline bool hasINS() const
1163  {
1164  int type = bcf_has_variant_types(line.get(), VCF_INS, bcf_match_overlap);
1165  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1166  if(type == 0) return false;
1167  return true;
1168  }
1169 
1171  inline bool hasDEL() const
1172  {
1173  int type = bcf_has_variant_types(line.get(), VCF_DEL, bcf_match_overlap);
1174  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1175  if(type == 0) return false;
1176  return true;
1177  }
1178 
1180  inline bool hasMNP() const
1181  {
1182  int type = bcf_has_variant_types(line.get(), VCF_MNP, bcf_match_overlap);
1183  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1184  if(type == 0) return false;
1185  return true;
1186  }
1187 
1189  inline bool hasBND() const
1190  {
1191  int type = bcf_has_variant_types(line.get(), VCF_BND, bcf_match_overlap);
1192  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1193  if(type == 0) return false;
1194  return true;
1195  }
1196 
1198  inline bool hasOTHER() const
1199  {
1200  int type = bcf_has_variant_types(line.get(), VCF_OTHER, bcf_match_overlap);
1201  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1202  if(type == 0) return false;
1203  return true;
1204  }
1205 
1207  inline bool hasOVERLAP() const
1208  {
1209  int type = bcf_has_variant_types(line.get(), VCF_OVERLAP, bcf_match_overlap);
1210  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1211  if(type == 0) return false;
1212  return true;
1213  }
1214 
1216  inline std::string CHROM() const
1217  {
1218  return std::string(bcf_hdr_id2name(header->hdr, line->rid));
1219  }
1220 
1222  inline std::string ID() const
1223  {
1224  return std::string(line->d.id);
1225  }
1226 
1228  inline int64_t POS() const
1229  {
1230  return line->pos + 1;
1231  }
1232 
1234  inline void setCHR(const char * chr)
1235  {
1236  line->rid = bcf_hdr_name2id(header->hdr, chr);
1237  }
1238 
1240  inline void setPOS(int64_t p)
1241  {
1242  line->pos = p - 1;
1243  }
1244 
1246  inline void setID(const char * s)
1247  {
1248  bcf_update_id(header->hdr, line.get(), s);
1249  }
1250 
1252  inline void setRefAlt(const char * alleles_string)
1253  {
1254  bcf_update_alleles_str(header->hdr, line.get(), alleles_string);
1255  }
1256 
1258  inline int64_t Start() const
1259  {
1260  return line->pos;
1261  }
1262 
1264  inline int64_t End() const
1265  {
1266  return line->pos + line->rlen;
1267  }
1268 
1270  inline std::string REF() const
1271  {
1272  return std::string(line->d.allele[0]);
1273  }
1274 
1276  inline void swap_REF_ALT()
1277  {
1278  if(!isMultiAllelicSNP()) std::swap(line->d.allele[0], line->d.allele[1]);
1279  }
1280 
1282  inline std::string ALT() const
1283  {
1284  std::string s;
1285  for(int i = 1; i < line->n_allele; i++)
1286  {
1287  s += std::string(line->d.allele[i]) + ",";
1288  }
1289  if(s.length() > 1) s.pop_back();
1290  return s;
1291  }
1292 
1294  inline float QUAL()
1295  {
1296  if(bcf_float_is_missing(line->qual))
1297  {
1298  noneMissing = false;
1299  return bcf_float_missing;
1300  }
1301  else
1302  {
1303  return line->qual;
1304  }
1305  }
1306 
1308  inline std::string FILTER()
1309  {
1310  if(line->d.n_flt == 0)
1311  {
1312  return ".";
1313  }
1314  else if(line->d.n_flt == 1)
1315  {
1316  return std::string(bcf_hdr_int2id(header->hdr, BCF_DT_ID, line->d.flt[0]));
1317  }
1318  else
1319  {
1320  std::string s;
1321  for(int i = 1; i < line->d.n_flt; i++)
1322  {
1323  s += std::string(bcf_hdr_int2id(header->hdr, BCF_DT_ID, line->d.flt[i])) + ",";
1324  }
1325  s.pop_back();
1326  return s;
1327  }
1328  }
1329 
1331  inline std::string allINFO()
1332  {
1333  int32_t max_dt_id = header->hdr->n[BCF_DT_ID];
1334  kstring_t * s = (kstring_t *)calloc(1, sizeof(kstring_t));
1335  if(line->n_info)
1336  {
1337  int first = 1;
1338  for(int i = 0; i < line->n_info; ++i)
1339  {
1340  bcf_info_t * z = &line->d.info[i];
1341  if(!z->vptr) continue;
1342  if(!first) kputc(';', s);
1343  first = 0;
1344  if(z->key < 0 || z->key >= max_dt_id || header->hdr->id[BCF_DT_ID][z->key].key == NULL)
1345  throw std::runtime_error("Invalid BCF and wrong INFO tag");
1346  kputs(header->hdr->id[BCF_DT_ID][z->key].key, s);
1347  if(z->len <= 0) continue;
1348  kputc('=', s);
1349  if(z->len == 1)
1350  {
1351  switch(z->type)
1352  {
1353  case BCF_BT_INT8:
1354  if(z->v1.i == bcf_int8_missing)
1355  kputc('.', s);
1356  else
1357  kputw(z->v1.i, s);
1358  break;
1359  case BCF_BT_INT16:
1360  if(z->v1.i == bcf_int16_missing)
1361  kputc('.', s);
1362  else
1363  kputw(z->v1.i, s);
1364  break;
1365  case BCF_BT_INT32:
1366  if(z->v1.i == bcf_int32_missing)
1367  kputc('.', s);
1368  else
1369  kputw(z->v1.i, s);
1370  break;
1371  case BCF_BT_INT64:
1372  if(z->v1.i == bcf_int64_missing)
1373  kputc('.', s);
1374  else
1375  kputll(z->v1.i, s);
1376  break;
1377  case BCF_BT_FLOAT:
1378  if(bcf_float_is_missing(z->v1.f))
1379  kputc('.', s);
1380  else
1381  kputd(z->v1.f, s);
1382  break;
1383  case BCF_BT_CHAR:
1384  kputc(z->v1.i, s);
1385  break;
1386  default:
1387  throw std::runtime_error("Unexpected type in INFO");
1388  }
1389  }
1390  else
1391  bcf_fmt_array(s, z->len, z->type, z->vptr);
1392  }
1393  if(first) kputc('.', s);
1394  }
1395  else
1396  kputc('.', s);
1397  std::string out = std::string(s->s, s->l);
1398  free(s->s);
1399  free(s);
1400  return out;
1401  }
1402 
1404  inline bool allPhased() const
1405  {
1406  return isAllPhased;
1407  }
1408 
1410  inline int ploidy() const
1411  {
1412  return nploidy;
1413  }
1414 
1416  inline void setPloidy(int v)
1417  {
1418  nploidy = v;
1419  }
1420 
1432  std::vector<char> typeOfGT;
1433 
1435  std::vector<char> gtPhase;
1436 };
1437 
1443 {
1444  private:
1445  std::shared_ptr<htsFile> fp; // hts file
1446  hts_idx_t * hidx = NULL; // hts index file
1447  tbx_t * tidx = NULL; // .tbi .csi index file for vcf files
1448  hts_itr_t * itr = NULL; // tabix iterator
1449  kstring_t s = {0, 0, NULL}; // kstring
1450  std::string fname;
1451  bool isBcf; // if the input file is bcf or vcf;
1452 
1453  public:
1459  std::vector<std::string> SamplesName;
1460 
1463 
1468  BcfReader(const std::string & file) : fname(file)
1469  {
1470  open(file);
1471  }
1472 
1478  BcfReader(const std::string & file, const std::string & region) : fname(file)
1479  {
1480  open(file);
1481  if(!region.empty()) setRegion(region);
1483  }
1484 
1495  BcfReader(const std::string & file, const std::string & region, const std::string & samples) : fname(file)
1496  {
1497  open(file);
1498  if(!region.empty()) setRegion(region);
1499  if(!samples.empty()) setSamples(samples);
1500  }
1501 
1502  ~BcfReader()
1503  {
1504  close();
1505  }
1506 
1508  void close()
1509  {
1510  if(fp) fp.reset();
1511  if(itr) hts_itr_destroy(itr);
1512  if(hidx) hts_idx_destroy(hidx);
1513  if(tidx) tbx_destroy(tidx);
1514  if(s.s) free(s.s);
1515  }
1516 
1518  void open(const std::string & file)
1519  {
1520  fname = file;
1521  fp = std::shared_ptr<htsFile>(hts_open(file.c_str(), "r"), htsFile_close());
1522  header.hdr = bcf_hdr_read(fp.get());
1523  nsamples = bcf_hdr_nsamples(header.hdr);
1525  }
1526 
1528  inline int setThreads(int n)
1529  {
1530  return hts_set_threads(fp.get(), n);
1531  }
1532 
1534  const BcfHeader & getHeader() const
1535  {
1536  return header;
1537  }
1538 
1540  uint64_t getVariantsCount(BcfRecord & r, const std::string & region)
1541  {
1542  uint64_t c{0};
1543  while(getNextVariant(r)) c++;
1544  setRegion(region); // reset the region
1545  return c;
1546  }
1547 
1553  void setSamples(const std::string & samples)
1554  {
1555  header.setSamples(samples);
1556  nsamples = bcf_hdr_nsamples(header.hdr);
1558  }
1559 
1564  void setRegion(const std::string & region)
1565  {
1566  // 1. check and load index first
1567  // 2. query iterval region
1568  // 3. if region is empty, use "."
1569  if(isEndWith(fname, "bcf") || isEndWith(fname, "bcf.gz"))
1570  {
1571  isBcf = true;
1572  hidx = bcf_index_load(fname.c_str());
1573  if(itr) bcf_itr_destroy(itr); // reset current region.
1574  if(region.empty())
1575  itr = bcf_itr_querys(hidx, header.hdr, ".");
1576  else
1577  itr = bcf_itr_querys(hidx, header.hdr, region.c_str());
1578  }
1579  else
1580  {
1581  isBcf = false;
1582  tidx = tbx_index_load(fname.c_str());
1583  assert(tidx != NULL && "error loading tabix index!");
1584  if(itr) tbx_itr_destroy(itr); // reset current region.
1585  if(region.empty())
1586  itr = tbx_itr_querys(tidx, ".");
1587  else
1588  itr = tbx_itr_querys(tidx, region.c_str());
1589  assert(itr != NULL && "no interval region found.failed!");
1590  }
1591  }
1592 
1596  {
1597  int ret = -1;
1598  if(itr != NULL)
1599  {
1600  if(isBcf)
1601  {
1602  ret = bcf_itr_next(fp.get(), itr, r.line.get());
1603  bcf_unpack(r.line.get(), BCF_UN_ALL);
1604  return (ret >= 0);
1605  }
1606  else
1607  {
1608  int slen = tbx_itr_next(fp.get(), tidx, itr, &s);
1609  if(slen > 0)
1610  {
1611  ret = vcf_parse1(&s, r.header->hdr, r.line.get()); // ret > 0, error
1612  bcf_unpack(r.line.get(), BCF_UN_ALL);
1613  }
1614  return (ret <= 0) && (slen > 0);
1615  }
1616  }
1617  else
1618  {
1619  ret = bcf_read(fp.get(), r.header->hdr, r.line.get());
1620  // unpack record immediately. not lazy
1621  bcf_unpack(r.line.get(), BCF_UN_ALL);
1622  return (ret == 0);
1623  }
1624  }
1625 };
1626 
1632 {
1633  private:
1634  std::shared_ptr<htsFile> fp; // hts file
1635  std::shared_ptr<bcf1_t> b = std::shared_ptr<bcf1_t>(bcf_init(), variant_close()); // variant
1636  int ret;
1637  bool isHeaderWritten = false;
1638  const BcfHeader * hp;
1639 
1640  public:
1643 
1646 
1652  BcfWriter(const std::string & fname, std::string version = "VCF4.1")
1653  {
1654  open(fname);
1655  initalHeader(version);
1657  }
1658 
1664  BcfWriter(const std::string & fname, const BcfHeader & h)
1665  {
1666  open(fname);
1667  initalHeader(h);
1668  }
1669 
1680  BcfWriter(const std::string & fname, const std::string & version, const std::string & mode)
1681  {
1682  open(fname, mode);
1683  initalHeader(version);
1685  }
1686 
1697  BcfWriter(const std::string & fname, const BcfHeader & h, const std::string & mode)
1698  {
1699  open(fname, mode);
1700  initalHeader(h);
1701  }
1702 
1703  ~BcfWriter() {}
1704 
1709  void open(const std::string & fname)
1710  {
1711  auto mode = getMode(fname, "w");
1712  fp = std::shared_ptr<htsFile>(hts_open(fname.c_str(), mode.c_str()), htsFile_close());
1713  }
1714 
1724  void open(const std::string & fname, const std::string & mode)
1725  {
1726  fp = std::shared_ptr<htsFile>(hts_open(fname.c_str(), mode.c_str()), htsFile_close());
1727  }
1728 
1730  void close()
1731  {
1732  if(!isHeaderWritten) writeHeader();
1733  if(b) b.reset();
1734  if(fp) fp.reset();
1735  }
1736 
1738  void initalHeader(std::string version = "VCF4.1")
1739  {
1740  header.hdr = bcf_hdr_init("w");
1741  header.setVersion(version);
1742  }
1743 
1745  void initalHeader(const BcfHeader & h)
1746  {
1747  hp = &h;
1748  }
1749 
1751  void copyHeader(const std::string & vcffile)
1752  {
1753  htsFile * fp2 = hts_open(vcffile.c_str(), "r");
1754  header.hdr = bcf_hdr_read(fp2);
1755  hts_close(fp2);
1757  }
1758 
1760  void writeLine(const std::string & vcfline)
1761  {
1762  if(!isHeaderWritten && !writeHeader()) throw std::runtime_error("could not write header\n");
1763  kstring_t s = {0, 0, NULL};
1764  kputsn(vcfline.c_str(), vcfline.length(), &s);
1765  ret = vcf_parse1(&s, hp->hdr, b.get());
1766  free(s.s);
1767  if(ret > 0) throw std::runtime_error("error parsing: " + vcfline + "\n");
1768  if(b->errcode == BCF_ERR_CTG_UNDEF)
1769  {
1770  throw std::runtime_error("contig id " + (std::string)bcf_hdr_id2name(hp->hdr, b->rid)
1771  + " not found in the header. please run header->AddContig() first.\n");
1772  }
1773  ret = bcf_write(fp.get(), hp->hdr, b.get());
1774  if(ret != 0) throw std::runtime_error("error writing: " + vcfline + "\n");
1775  }
1776 
1779  {
1780  ret = bcf_hdr_write(fp.get(), hp->hdr);
1781  if(ret == 0) return isHeaderWritten = true;
1782  return false;
1783  }
1784 
1786  inline bool writeRecord(BcfRecord & v)
1787  {
1788  if(!isHeaderWritten) writeHeader();
1789  if(bcf_write(fp.get(), v.header->hdr, v.line.get()) < 0) return false;
1790  return true;
1791  }
1792 };
1793 
1794 } // namespace vcfpp
1795 
1796 #endif // VCFPP_H_
Object represents header of the VCF, offering methods to access and modify the tags.
Definition: vcfpp.h:184
std::string asString() const
return header as a string
Definition: vcfpp.h:282
void removeFormat(std::string tag) const
remove a FORMAT tag from header
Definition: vcfpp.h:363
void addContig(const std::string &id)
add contig to header
Definition: vcfpp.h:254
void addFILTER(const std::string &id, const std::string &description)
add one FILTER line to header
Definition: vcfpp.h:246
int getFormatType(std::string tag) const
get the type of a given tag
Definition: vcfpp.h:328
void addSample(const std::string &sample) const
add one sample name to header
Definition: vcfpp.h:272
void addFORMAT(const std::string &id, const std::string &number, const std::string &type, const std::string &description)
add FORMAT field to header
Definition: vcfpp.h:232
void setVersion(const std::string &version) const
set the VCF version
Definition: vcfpp.h:390
void removeInfo(std::string tag) const
remove a INFO tag from header
Definition: vcfpp.h:357
void setSamples(const std::string &samples) const
explicitly set samples to be extracted
Definition: vcfpp.h:378
void addLine(const std::string &str)
add one line to header
Definition: vcfpp.h:262
std::vector< std::string > getSeqnames() const
return all contig/chromosome names in a string vector
Definition: vcfpp.h:309
void removeContig(std::string tag) const
remove a contig tag from header
Definition: vcfpp.h:351
std::vector< std::string > getSamples() const
return all samples in a string vector
Definition: vcfpp.h:298
int nSamples() const
return the number of samples in the VCF
Definition: vcfpp.h:396
friend std::ostream & operator<<(std::ostream &out, const BcfHeader &h)
stream out the header
Definition: vcfpp.h:203
void removeFilter(std::string tag) const
remove a FILTER tag from header
Definition: vcfpp.h:369
void addINFO(const std::string &id, const std::string &number, const std::string &type, const std::string &description)
add INFO field to header
Definition: vcfpp.h:217
Stream in variants from compressed/uncompressed VCF/BCF file or stdin.
Definition: vcfpp.h:1443
void setSamples(const std::string &samples)
explicitly stream to specific samples
Definition: vcfpp.h:1553
void open(const std::string &file)
open a VCF/BCF/STDIN file for streaming in
Definition: vcfpp.h:1518
BcfReader(const std::string &file, const std::string &region, const std::string &samples)
construct a vcf/bcf reader with subset samples in target region
Definition: vcfpp.h:1495
BcfHeader header
a BcfHeader object
Definition: vcfpp.h:1455
int setThreads(int n)
set the number of threads to use
Definition: vcfpp.h:1528
uint64_t getVariantsCount(BcfRecord &r, const std::string &region)
get the number of records of given region
Definition: vcfpp.h:1540
BcfReader(const std::string &file, const std::string &region)
construct a vcf/bcf reader with subset samples
Definition: vcfpp.h:1478
bool getNextVariant(BcfRecord &r)
read in the next variant
Definition: vcfpp.h:1595
BcfReader()
Construct an empty BcfReader.
Definition: vcfpp.h:1462
int nsamples
number of samples in the VCF
Definition: vcfpp.h:1457
const BcfHeader & getHeader() const
return a BcfHeader object
Definition: vcfpp.h:1534
BcfReader(const std::string &file)
construct a vcf/bcf reader from file.
Definition: vcfpp.h:1468
void close()
close the BcfReader object.
Definition: vcfpp.h:1508
void setRegion(const std::string &region)
explicitly stream to specific region
Definition: vcfpp.h:1564
std::vector< std::string > SamplesName
number of samples in the VCF
Definition: vcfpp.h:1459
Object represents a variant record in the VCF, offering methods to access and modify fields.
Definition: vcfpp.h:409
float QUAL()
return QUAL value
Definition: vcfpp.h:1294
bool isIndel() const
return boolean value indicates if current variant is exclusively INDEL
Definition: vcfpp.h:1092
bool hasINDEL() const
return boolean value indicates if current variant has INDEL type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1153
void setID(const char *s)
update ID
Definition: vcfpp.h:1246
isValidGT< T > getGenotypes(T &v)
fill in the input vector with genotypes of 0 and 1. only works for ploidy<=2. Genotypes with missing ...
Definition: vcfpp.h:494
void setPOS(int64_t p)
modify position given 1-based value
Definition: vcfpp.h:1240
bool hasINS() const
return boolean value indicates if current variant has INS type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1162
BcfRecord()
empty constructor. call init() afterwards
Definition: vcfpp.h:432
std::string allINFO()
return raw INFO column as string. recommend to use getINFO for specific tag.
Definition: vcfpp.h:1331
int ploidy() const
return the number of ploidy of current variant
Definition: vcfpp.h:1410
void removeINFO(std::string tag)
remove the given tag from INFO of the variant
Definition: vcfpp.h:881
BcfRecord(BcfHeader &h)
constructor with a given BcfHeader object
Definition: vcfpp.h:435
std::string CHROM() const
return CHROM name
Definition: vcfpp.h:1216
isValidFMT< T > setFORMAT(std::string tag, const T &v)
set tag values for all samples in FORMAT using given vector
Definition: vcfpp.h:987
int64_t POS() const
return 1-base position
Definition: vcfpp.h:1228
std::string ID() const
return ID field
Definition: vcfpp.h:1222
isScalar< T > setFORMAT(std::string tag, const T &v)
set tag for a single sample in FORMAT using given singular value. this works only when there is one s...
Definition: vcfpp.h:1025
bool isSV() const
return boolean value indicates if current variant is Structual Variant or not
Definition: vcfpp.h:1079
bool hasSNP() const
return boolean value indicates if current variant has SNP type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1144
std::vector< char > isGenoMissing
if there is "." in GT for the sample, then it's coded as missing (TRUE)
Definition: vcfpp.h:428
std::string FILTER()
return raw FILTER column as string
Definition: vcfpp.h:1308
void swap_REF_ALT()
swap REF and ALT for biallelic SNP
Definition: vcfpp.h:1276
std::string REF() const
return raw REF alleles as string
Definition: vcfpp.h:1270
std::string asString() const
return current variant as raw string
Definition: vcfpp.h:470
int64_t Start() const
return 0-base start of the variant (can be any type)
Definition: vcfpp.h:1258
bool hasBND() const
return boolean value indicates if current variant has BND type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1189
friend std::ostream & operator<<(std::ostream &out, const BcfRecord &v)
stream out the variant
Definition: vcfpp.h:463
isScalar< T > setINFO(std::string tag, const T &v)
set tag value for INFO
Definition: vcfpp.h:814
void resetHeader(BcfHeader &h)
reset the header associated with BcfRecord object by pointing to another BcfHeader object
Definition: vcfpp.h:457
bool hasMNP() const
return boolean value indicates if current variant has MNP type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1180
bool allPhased() const
return boolean value indicates if genotypes of all samples are phased
Definition: vcfpp.h:1404
bool hasDEL() const
return boolean value indicates if current variant has DEL type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1171
bool isMultiAllelicSNP() const
return boolean value indicates if current variant is exclusively multiallelic SNP sites
Definition: vcfpp.h:1113
void setPhasing(const std::vector< char > &v)
set phasing status for all diploid samples using given vector
Definition: vcfpp.h:953
bool setGenotypes(const std::vector< int > &v)
set genotypes from scratch even if genotypes not present
Definition: vcfpp.h:912
std::string ALT() const
return raw ALT alleles as string
Definition: vcfpp.h:1282
void setCHR(const char *chr)
modify CHROM value
Definition: vcfpp.h:1234
bool hasOTHER() const
return boolean value indicates if current variant has OTHER type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1198
isString< T > getINFO(std::string tag, T &v)
get tag value in INFO
Definition: vcfpp.h:789
bool isSNP() const
return boolean value indicates if current variant is exclusively biallelic SNP. Note ALT=* are skippe...
Definition: vcfpp.h:1130
bool isMultiAllelics() const
return boolean value indicates if current variant is multiallelic sites
Definition: vcfpp.h:1106
int64_t End() const
return 0-base end of the variant (can be any type)
Definition: vcfpp.h:1264
void setRefAlt(const char *alleles_string)
set REF and ALT alleles given a string seperated by comma
Definition: vcfpp.h:1252
void setPloidy(int v)
in a rare case, one may want to set the number of ploidy manually
Definition: vcfpp.h:1416
isFormatVector< T > getFORMAT(std::string tag, T &v)
get tag value in FORMAT
Definition: vcfpp.h:626
void initHeader(BcfHeader &h)
initilize the header associated with BcfRecord object by pointing to another BcfHeader object
Definition: vcfpp.h:447
void addLineFromString(const std::string &vcfline)
add one variant record from given string
Definition: vcfpp.h:1052
bool getFORMAT(std::string tag, std::vector< std::string > &v)
get tag value in FORMAT
Definition: vcfpp.h:674
bool isNoneMissing() const
if all samples have non missing values for the tag in FORMAT
Definition: vcfpp.h:1073
isValidInfo< T > setINFO(std::string tag, const T &v)
set tag value for INFO
Definition: vcfpp.h:849
std::vector< char > gtPhase
vector of nsamples length. keep track of the phasing status of each sample
Definition: vcfpp.h:1435
void removeFORMAT(std::string tag)
remove the given tag from FORMAT of the variant
Definition: vcfpp.h:960
std::vector< char > typeOfGT
vector of nsamples length. keep track of the type of genotype (one of GT_HOM_RR, GT_HET_RA,...
Definition: vcfpp.h:1432
isInfoVector< T > getINFO(std::string tag, T &v)
get tag value in INFO
Definition: vcfpp.h:714
bool getGenotypes(std::vector< int > &v)
fill in the input vector with genotype values, 0, 1 or -9 (missing).
Definition: vcfpp.h:567
bool hasOVERLAP() const
return boolean value indicates if current variant has OVERLAP type defined in vcf....
Definition: vcfpp.h:1207
isScalar< T > getINFO(std::string tag, T &v)
get tag value in INFO
Definition: vcfpp.h:752
Stream out variants to compressed/uncompressed VCF/BCF file or stdout.
Definition: vcfpp.h:1632
void close()
close the BcfWriter object.
Definition: vcfpp.h:1730
void writeLine(const std::string &vcfline)
copy a string to a vcf line
Definition: vcfpp.h:1760
BcfWriter(const std::string &fname, const std::string &version, const std::string &mode)
Open VCF/BCF file for writing using given mode.
Definition: vcfpp.h:1680
BcfWriter(const std::string &fname, std::string version="VCF4.1")
Open VCF/BCF file for writing. The format is infered from file's suffix.
Definition: vcfpp.h:1652
void copyHeader(const std::string &vcffile)
copy header of given VCF
Definition: vcfpp.h:1751
void open(const std::string &fname)
Open VCF/BCF file for writing. The format is infered from file's suffix.
Definition: vcfpp.h:1709
BcfWriter(const std::string &fname, const BcfHeader &h)
Open VCF/BCF file for writing. The format is infered from file's suffix.
Definition: vcfpp.h:1664
void initalHeader(const BcfHeader &h)
initial a VCF header by pointing to header of another VCF
Definition: vcfpp.h:1745
BcfWriter(const std::string &fname, const BcfHeader &h, const std::string &mode)
Open VCF/BCF file for writing using given mode.
Definition: vcfpp.h:1697
void initalHeader(std::string version="VCF4.1")
initial a VCF header using the internal template given a specific version. VCF4.1 is the default
Definition: vcfpp.h:1738
BcfHeader header
header object initialized by initalHeader
Definition: vcfpp.h:1642
BcfWriter()
Construct an empty BcfWriter.
Definition: vcfpp.h:1645
void open(const std::string &fname, const std::string &mode)
Open VCF/BCF file for writing using given mode.
Definition: vcfpp.h:1724
bool writeRecord(BcfRecord &v)
streams out the given variant of BcfRecord type
Definition: vcfpp.h:1786
bool writeHeader()
streams out the header
Definition: vcfpp.h:1778
Definition: vcfpp.h:171
Definition: vcfpp.h:153
Definition: vcfpp.h:162