vcfpp
All Classes Functions Variables Friends Pages
vcfpp.h
1 /*******************************************************************************
2  * @file https://github.com/Zilong-Li/vcfpp/vcfpp.h
3  * @author Zilong Li
4  * @email zilong.dk@gmail.com
5  * @version v0.7.2
6  * @breif a single C++ file for manipulating VCF
7  * Copyright (C) 2022-2025.The use of this code is governed by the LICENSE file.
8  ******************************************************************************/
9 
36 #pragma once
37 
38 #include <cstddef>
39 #include <stdexcept>
40 #ifndef VCFPP_H_
41 # define VCFPP_H_
42 
43 # include <iostream>
44 # include <memory>
45 # include <string>
46 # include <type_traits>
47 # include <vector>
48 
49 // make sure you have htslib installed
50 extern "C"
51 {
52 # include <htslib/hts.h>
53 # include <htslib/kstring.h>
54 # include <htslib/tbx.h>
55 # include <htslib/vcf.h>
56 # include <htslib/vcfutils.h>
57 }
58 
59 namespace vcfpp
60 {
61 template<typename T>
62 using isValidFMT =
63  typename std::enable_if<std::is_same<T, std::string>::value || std::is_same<T, std::vector<char>>::value
64  || std::is_same<T, std::vector<int>>::value
65  || std::is_same<T, std::vector<float>>::value,
66  bool>::type;
67 
68 template<typename T>
69 using isValidInfo =
70  typename std::enable_if<std::is_same<T, std::string>::value || std::is_same<T, std::vector<int>>::value
71  || std::is_same<T, std::vector<float>>::value,
72  bool>::type;
73 
74 template<typename T>
75 using isInfoVector = typename std::enable_if<std::is_same<T, std::vector<int>>::value
76  || std::is_same<T, std::vector<float>>::value,
77  bool>::type;
78 
79 template<typename T>
80 using isScalar = typename std::enable_if<std::is_same<T, int>::value || std::is_same<T, float>::value
81  || std::is_same<T, double>::value,
82  bool>::type;
83 
84 template<typename T>
85 using isString = typename std::enable_if<std::is_same<T, std::string>::value, bool>::type;
86 
87 template<typename T>
88 using isValidGT = typename std::enable_if<std::is_same<T, std::vector<bool>>::value
89  || std::is_same<T, std::vector<char>>::value,
90  bool>::type;
91 
92 template<typename T>
93 using isFormatVector = typename std::enable_if<std::is_same<T, std::vector<float>>::value
94  || std::is_same<T, std::vector<char>>::value
95  || std::is_same<T, std::vector<int>>::value,
96  bool>::type;
97 
98 namespace details
99 {
100 
101 template<typename T>
102 isScalar<T> isMissing(T const & v)
103 {
104  if(std::is_same<T, float>::value)
105  {
106  return bcf_float_is_missing(v);
107  }
108  else if(std::is_same<T, int>::value)
109  {
110  return bcf_int32_missing(v);
111  }
112 }
113 
114 // test if a string is end with another string
115 inline bool isEndWith(std::string const & s, std::string const & e)
116 {
117  if(s.length() >= e.length())
118  {
119  return (0 == s.compare(s.length() - e.length(), e.length(), e));
120  }
121  else
122  {
123  return false;
124  }
125 }
126 
127 // determinate the mode for read/write the compressed/uncompressed VCF/BCF
128 inline std::string getMode(std::string const & fname, std::string mode = "r")
129 {
130  if(isEndWith(fname, "bcf.gz")) mode += "b";
131  if(isEndWith(fname, "bcf")) mode += "bu";
132  if(isEndWith(fname, "vcf.gz")) mode += "z";
133  return mode;
134 }
135 
136 // string split by separator
137 inline std::vector<std::string> split_string(const std::string & s, const std::string & separators)
138 {
139  std::vector<std::string> ret;
140  bool is_seperator[256] = {false};
141  for(auto & ch : separators)
142  {
143  is_seperator[(unsigned int)ch] = true;
144  }
145  int begin = 0;
146  for(int i = 0; i <= (int)s.size(); i++)
147  {
148  if(is_seperator[(uint8_t)s[i]] || i == (int)s.size())
149  {
150  ret.emplace_back(s.begin() + begin, s.begin() + i);
151  begin = i + 1;
152  }
153  }
154  return ret;
155 }
156 
157 // deleter for htsFile
159 {
160  void operator()(htsFile * x)
161  {
162  if(x) hts_close(x);
163  }
164 };
165 
166 // deleter for hts iterator
168 {
169  void operator()(hts_itr_t * x)
170  {
171  if(x) hts_itr_destroy(x);
172  }
173 };
174 
175 // deleter for hts idx
177 {
178  void operator()(hts_idx_t * x)
179  {
180  if(x) hts_idx_destroy(x);
181  }
182 };
183 
184 // deleter for tabix idx
186 {
187  void operator()(tbx_t * x)
188  {
189  if(x) tbx_destroy(x);
190  }
191 };
192 
193 // deleter for variant
195 {
196  void operator()(bcf1_t * x)
197  {
198  if(x) bcf_destroy(x);
199  }
200 };
201 
202 // deleter for bcf header
204 {
205  void operator()(bcf_hdr_t * x)
206  {
207  if(x) bcf_hdr_destroy(x);
208  }
209 };
210 
211 } // namespace details
212 
219 {
220  friend class BcfRecord;
221  friend class BcfReader;
222  friend class BcfWriter;
223 
224  private:
225  std::shared_ptr<bcf_hdr_t> hdr; // bcf header
226 
227  public:
228  BcfHeader() = default;
229 
230  ~BcfHeader() = default;
231 
233  BcfHeader(std::string version)
234  {
235  auto h = bcf_hdr_init("w");
236  hdr = std::shared_ptr<bcf_hdr_t>(bcf_hdr_dup(h), details::bcf_hdr_close());
237  bcf_hdr_destroy(h);
238  bcf_hdr_set_version(hdr.get(), version.c_str());
239  }
240 
242  BcfHeader(const bcf_hdr_t * h)
243  {
244  hdr = std::shared_ptr<bcf_hdr_t>(bcf_hdr_dup(h), details::bcf_hdr_close());
245  }
246 
248  bcf_hdr_t * get() const
249  {
250  return hdr.get();
251  }
252 
254  friend std::ostream & operator<<(std::ostream & out, const BcfHeader & h)
255  {
256  out << h.asString();
257  return out;
258  }
259 
260  // TODO: check if the value is valid for vcf specification
261 
268  inline void addINFO(const std::string & id,
269  const std::string & number,
270  const std::string & type,
271  const std::string & description)
272  {
273  addLine("##INFO=<ID=" + id + ",Number=" + number + ",Type=" + type + ",Description=\"" + description
274  + "\">");
275  }
276 
283  inline void addFORMAT(const std::string & id,
284  const std::string & number,
285  const std::string & type,
286  const std::string & description)
287  {
288  addLine("##FORMAT=<ID=" + id + ",Number=" + number + ",Type=" + type + ",Description=\"" + description
289  + "\">");
290  }
291 
297  inline void addFILTER(const std::string & id, const std::string & description)
298  {
299  addLine("##FILTER=<ID=" + id + ",Description=\"" + description + "\">");
300  }
301 
305  inline void addContig(const std::string & id)
306  {
307  addLine("##contig=<ID=" + id + ">");
308  }
309 
313  inline void addLine(const std::string & str)
314  {
315  int ret = 0;
316  ret = bcf_hdr_append(hdr.get(), str.c_str());
317  if(ret != 0) throw std::runtime_error("could not add " + str + " to header\n");
318  ret = bcf_hdr_sync(hdr.get());
319  if(ret != 0) throw std::runtime_error("could not add " + str + " to header\n");
320  }
321 
323  inline void addSample(const std::string & sample) const
324  {
325  bcf_hdr_add_sample(hdr.get(), sample.c_str());
326  if(bcf_hdr_sync(hdr.get()) != 0)
327  {
328  throw std::runtime_error("couldn't add the sample.\n");
329  }
330  }
331 
333  inline std::string asString() const
334  {
335  kstring_t s = {0, 0, nullptr}; // kstring
336  if(bcf_hdr_format(hdr.get(), 0, &s) == 0) // append header string to s.s! append!
337  {
338  std::string out(s.s, s.l);
339  free(s.s);
340  return out;
341  }
342  else
343  {
344  throw std::runtime_error("failed to convert formatted header to string");
345  }
346  }
347 
349  std::vector<std::string> getSamples() const
350  {
351  std::vector<std::string> vec;
352  for(int i = 0; i < bcf_hdr_nsamples(hdr); i++)
353  {
354  vec.emplace_back(hdr->samples[i]);
355  }
356  return vec;
357  }
358 
360  std::vector<std::string> getSeqnames() const
361  {
362  int ret = 0;
363  const char ** seqs = bcf_hdr_seqnames(hdr.get(), &ret);
364  if(ret == 0) printf("there is no contig id in the header!\n");
365  std::vector<std::string> vec;
366  for(int i = 0; i < ret; i++)
367  {
368  vec.emplace_back(seqs[i]);
369  }
370  free(seqs);
371  return vec;
372  }
373 
379  inline int getFormatType(std::string tag) const
380  {
381  int tag_id = bcf_hdr_id2int(hdr.get(), BCF_DT_ID, tag.c_str());
382  if(tag_id < 0) return 0;
383  if(bcf_hdr_id2type(hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
384  {
385  return 1;
386  }
387  else if(bcf_hdr_id2type(hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
388  {
389  return 2;
390  }
391  else if(bcf_hdr_id2type(hdr, BCF_HL_FMT, tag_id) == (BCF_HT_STR & 0xff))
392  {
393  return 3;
394  }
395  else
396  {
397  return 0;
398  }
399  }
400 
406  inline int getInfoType(std::string tag) const
407  {
408  int tag_id = bcf_hdr_id2int(hdr.get(), BCF_DT_ID, tag.c_str());
409  if(!bcf_hdr_idinfo_exists(hdr, BCF_HL_INFO, tag_id)) return -1; // no such INFO field in the header
410  if(bcf_hdr_id2type(hdr, BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff))
411  {
412  return 1;
413  }
414  else if(bcf_hdr_id2type(hdr, BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff))
415  {
416  return 2;
417  }
418  else if(bcf_hdr_id2type(hdr, BCF_HL_INFO, tag_id) == (BCF_HT_STR & 0xff))
419  {
420  return 3;
421  }
422  else
423  {
424  return 0;
425  }
426  }
427 
429  inline void removeContig(std::string tag) const
430  {
431  bcf_hdr_remove(hdr.get(), BCF_HL_CTG, tag.c_str());
432  }
433 
435  inline void removeInfo(std::string tag) const
436  {
437  bcf_hdr_remove(hdr.get(), BCF_HL_INFO, tag.c_str());
438  }
439 
441  inline void removeFormat(std::string tag) const
442  {
443  bcf_hdr_remove(hdr.get(), BCF_HL_FMT, tag.c_str());
444  }
445 
447  inline void removeFilter(std::string tag) const
448  {
449  bcf_hdr_remove(hdr.get(), BCF_HL_FLT, tag.c_str());
450  }
451 
457  void updateSamples(const std::string & samples)
458  {
459  auto ss = details::split_string(samples, ",");
460  const int nsamples = nSamples();
461  if(nsamples != (int)ss.size())
462  throw std::runtime_error("You provide either too few or too many samples");
463  kstring_t htxt = {0, 0, nullptr};
464  bcf_hdr_format(hdr.get(), 1, &htxt);
465  // Find the beginning of the #CHROM line
466  int i = htxt.l - 2, ncols = 0;
467  while(i >= 0 && htxt.s[i] != '\n')
468  {
469  if(htxt.s[i] == '\t') ncols++;
470  i--;
471  }
472  if(i < 0 || strncmp(htxt.s + i + 1, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", 45))
473  {
474  if(i > 0 && !strncmp(htxt.s + i + 1, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO", 38))
475  throw std::runtime_error("Error: missing FORMAT fields, cowardly refusing to add samples\n");
476  throw std::runtime_error("Could not parse the header: " + std::string(htxt.s));
477  }
478 
479  ncols = 0;
480  while(ncols != 9)
481  {
482  i++;
483  if(htxt.s[i] == '\t') ncols++;
484  }
485  htxt.l = i;
486 
487  // Replace all samples
488  for(i = 0; i < nsamples; i++)
489  {
490  kputc('\t', &htxt);
491  kputs(ss[i].c_str(), &htxt);
492  }
493  kputc('\n', &htxt);
494 
495  auto h = bcf_hdr_init("r");
496  if(bcf_hdr_parse(h, htxt.s) < 0)
497  throw std::runtime_error("An error occurred while parsing the header\n");
498  // destroy the old and make new header
499  hdr = std::shared_ptr<bcf_hdr_t>(bcf_hdr_dup(h), details::bcf_hdr_close()); // deep copy
500  // free everything
501  bcf_hdr_destroy(h);
502  free(htxt.s);
503  }
504 
509  inline void setSamples(const std::string & samples) const
510  {
511  int ret = 0;
512  ret = bcf_hdr_set_samples(hdr.get(), samples.c_str(), 0);
513  if(ret != 0)
514  {
515  throw std::runtime_error("the " + std::to_string(ret)
516  + "-th sample are not in the VCF.\nparameter samples:" + samples);
517  }
518  }
519 
521  inline void setVersion(const std::string & version) const
522  {
523  bcf_hdr_set_version(hdr.get(), version.c_str());
524  }
525 
527  inline int nSamples() const
528  {
529  return bcf_hdr_nsamples(hdr);
530  }
531 };
532 
540 {
541  friend class BcfReader;
542  friend class BcfWriter;
543 
544  private:
545  std::shared_ptr<bcf1_t> line = std::shared_ptr<bcf1_t>(bcf_init(), details::bcf_line_close()); // variant
546  BcfHeader * header = nullptr; // header the record is linked to
547  int32_t * gts = nullptr;
548  int32_t ndst, ret, nsamples;
549  bool noneMissing = true; // whenever parsing a tag have to reset this variable
550  bool isAllPhased = false;
551  int nploidy = 0; // the number of ploidy
552  int nvalues = 0; // the number of values for a tag in FORMAT
553 
554  public:
556  std::vector<char> isGenoMissing;
557 
558  public:
560  BcfRecord() = default;
561 
564  {
565  initHeader(h);
566  }
567 
568  ~BcfRecord()
569  {
570  if(gts) free(gts);
571  }
572 
575  {
576  header = &h;
577  if(!header->hdr) throw std::runtime_error("please initial header first\n");
578  nsamples = header->nSamples();
579  typeOfGT.resize(nsamples);
580  gtPhase.resize(nsamples, 0);
581  }
582 
585  {
586  header = &h;
587  }
588 
590  friend std::ostream & operator<<(std::ostream & out, const BcfRecord & v)
591  {
592  out << v.asString();
593  return out;
594  }
595 
597  inline std::string asString() const
598  {
599  kstring_t s = {0, 0, nullptr}; // kstring
600  if(vcf_format(header->hdr.get(), line.get(), &s) == 0)
601  {
602  std::string out(s.s, s.l);
603  free(s.s);
604  return out;
605  }
606  else
607  {
608  throw std::runtime_error("couldn't format current record into a string.\n");
609  }
610  }
611 
620  template<typename T>
621  isValidGT<T> getGenotypes(T & v)
622  {
623  ndst = 0;
624  ret = bcf_get_genotypes(header->hdr.get(), line.get(), &gts, &ndst);
625  if(ret <= 0)
626  {
627 # if defined(VERBOSE)
628  std::cerr << "GT not present for current site. did you initilize the variant object?\n";
629 # endif
630  return false;
631  }
632  // find the max nploidy using the first variant (eg. 2) resize v as max(nploidy)
633  // * nsamples (ret) NOTE: if ret == nsamples and only one sample then haploid
634  if(ret != nploidy * nsamples && nploidy > 0)
635  {
636  // rare case if nploidy is set manually. eg. only one sample. the first variant is '1' but the
637  // second is '1/0'. ret = 1 but nploidy should be 2
638  v.resize(nploidy * nsamples);
639  }
640  else
641  {
642  v.resize(ret);
643  nploidy = ret / nsamples;
644  }
645  // TODO: work with nploidy == 1, haploid?
646  isGenoMissing.assign(nsamples, 0);
647  int i, j, nphased = 0;
648  noneMissing = true;
649  bcf_fmt_t * fmt = bcf_get_fmt(header->hdr.get(), line.get(), "GT");
650  int nploidy_cur = ret / nsamples; // requires nploidy_cur <= nploidy
651  for(i = 0; i < nsamples; i++)
652  {
653  // check and fill in typeOfGT; only supports SNPs now. check vcfstats.c for inspiration
654  typeOfGT[i] = bcf_gt_type(fmt, i, nullptr, nullptr);
655  if(typeOfGT[i] == GT_UNKN)
656  {
657  noneMissing = false; // set missing as het, user should check if isNoneMissing();
658  isGenoMissing[i] = 1;
659  v[i * nploidy + 0] = 1;
660  for(j = 1; j < nploidy_cur; j++) v[i * nploidy + j] = 0;
661  continue;
662  }
663 
664  for(j = 0; j < nploidy_cur; j++)
665  {
666  // TODO: right now only parse 0 and 1, ie max(nploidy)=2; other values 2,3... will be set to 1
667  v[i * nploidy + j] = bcf_gt_allele(gts[j + i * nploidy_cur]) != 0;
668  }
669  if(nploidy == 2)
670  {
671  gtPhase[i] = (gts[1 + i * nploidy_cur] & 1) == 1;
672  nphased += gtPhase[i];
673  }
674  }
675  if(nphased == nsamples)
676  {
677  isAllPhased = true;
678  }
679  else
680  {
681  isAllPhased = false;
682  }
683  return true;
684  }
685 
693  bool getGenotypes(std::vector<int> & v)
694  {
695  ndst = 0;
696  ret = bcf_get_genotypes(header->hdr.get(), line.get(), &gts, &ndst);
697  if(ret <= 0)
698  {
699 # if defined(VERBOSE)
700  std::cerr << "GT not present for current site. did you initilize the variant object?\n";
701 # endif
702  return false;
703  }
704  v.resize(ret);
705  isGenoMissing.assign(nsamples, 0);
706  nploidy = ret / nsamples;
707  int i, j, nphased = 0;
708  noneMissing = true;
709  for(i = 0; i < nsamples; i++)
710  {
711  int32_t * ptr = gts + i * nploidy;
712  int is_phased = 0;
713  for(j = 0; j < nploidy; j++)
714  {
715  // if true, the sample has smaller ploidy
716  if(ptr[j] == bcf_int32_vector_end) break;
717  // missing allele
718  if(bcf_gt_is_missing(ptr[j]))
719  {
720  noneMissing = false;
721  isGenoMissing[i] = 1;
722  v[i * nploidy + j] = -9;
723  continue;
724  }
725  v[i * nploidy + j] = bcf_gt_allele(ptr[j]);
726  is_phased += bcf_gt_is_phased(ptr[j]);
727  }
728  if(nploidy == is_phased)
729  {
730  gtPhase[i] = true;
731  nphased += gtPhase[i];
732  }
733  }
734  if(nphased == nsamples)
735  {
736  isAllPhased = true;
737  }
738  else
739  {
740  isAllPhased = false;
741  }
742  return true;
743  }
744 
751  template<typename T, typename S = typename T::value_type>
752  isFormatVector<T> getFORMAT(std::string tag, T & v)
753  {
754  bcf_fmt_t * fmt = bcf_get_fmt(header->hdr.get(), line.get(), tag.c_str());
755  if(!fmt)
756  {
757  throw std::invalid_argument("no FORMAT=" + tag + " in the VCF header.\n");
758  }
759  nvalues = fmt->n;
760  ndst = 0;
761  S * dst = NULL;
762  int tagid = header->getFormatType(tag);
763  if(tagid == 1)
764  {
765  ret = bcf_get_format_int32(header->hdr.get(), line.get(), tag.c_str(), &dst, &ndst);
766  }
767  else if(tagid == 2)
768  {
769  ret = bcf_get_format_float(header->hdr.get(), line.get(), tag.c_str(), &dst, &ndst);
770  }
771  else if(tagid == 3)
772  {
773  ret = bcf_get_format_char(header->hdr.get(), line.get(), tag.c_str(), &dst, &ndst);
774  }
775  else
776  {
777  ret = -1;
778  }
779 
780  if(ret >= 0)
781  {
782  // user have to check if there is missing in the return v;
783  v = std::vector<S>(dst, dst + ret);
784  free(dst);
785  return true;
786  }
787  else
788  {
789  free(dst);
790  return false;
791  }
792  }
793 
800  bool getFORMAT(std::string tag, std::vector<std::string> & v)
801  {
802  bcf_fmt_t * fmt = bcf_get_fmt(header->hdr.get(), line.get(), tag.c_str());
803  if(!fmt)
804  {
805  throw std::invalid_argument("no FORMAT=" + tag + " in the VCF header.\n");
806  }
807  nvalues = fmt->n;
808  // if ndst < (fmt->n+1)*nsmpl; then realloc is involved
809  ret = -1, ndst = 0;
810  char ** dst = nullptr;
811  if(header->getFormatType(tag) == 3)
812  ret = bcf_get_format_string(header->hdr.get(), line.get(), tag.c_str(), &dst, &ndst);
813  if(ret > 0)
814  {
815  v.clear();
816  for(int i = 0; i < nsamples; i++)
817  {
818  // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT.
819  v.emplace_back(dst[i]);
820  };
821  free(dst[0]);
822  free(dst);
823  return true;
824  }
825  else
826  {
827  free(dst[0]);
828  free(dst);
829  return false;
830  }
831  }
832 
839  template<typename T, typename S = typename T::value_type>
840  isInfoVector<T> getINFO(std::string tag, T & v)
841  {
842  bcf_info_t * info = bcf_get_info(header->hdr.get(), line.get(), tag.c_str());
843  if(!info)
844  {
845  throw std::invalid_argument("no INFO=" + tag + " in the VCF header.\n");
846  }
847  S * dst = NULL;
848  ndst = 0;
849  ret = -1;
850  if(info->type == BCF_BT_INT8 || info->type == BCF_BT_INT16 || info->type == BCF_BT_INT32)
851  {
852  ret = bcf_get_info_int32(header->hdr.get(), line.get(), tag.c_str(), &dst, &ndst);
853  }
854  else if(info->type == BCF_BT_FLOAT)
855  {
856  ret = bcf_get_info_float(header->hdr.get(), line.get(), tag.c_str(), &dst, &ndst);
857  }
858  if(ret >= 0)
859  {
860  v = std::vector<S>(dst, dst + ret); // user have to check if there is missing in the return v;
861  free(dst);
862  return true;
863  }
864  else
865  {
866  free(dst);
867  return false;
868  }
869  }
870 
877  template<typename T>
878  isScalar<T> getINFO(std::string tag, T & v)
879  {
880  bcf_info_t * info = bcf_get_info(header->hdr.get(), line.get(), tag.c_str());
881  if(!info)
882  {
883  throw std::invalid_argument("no INFO=" + tag + " in the VCF header.\n");
884  }
885  // scalar value
886  if(info->len == 1)
887  {
888  if(info->type == BCF_BT_INT8 || info->type == BCF_BT_INT16 || info->type == BCF_BT_INT32)
889  {
890  v = info->v1.i;
891  }
892  else if(info->type == BCF_BT_FLOAT)
893  {
894  v = info->v1.f;
895  }
896  return true;
897  }
898  else
899  {
900 # if defined(VERBOSE)
901  std::cerr << "there are multiple values for " + tag
902  + " in INFO for current site. please use vector instead\n";
903 # endif
904  return false;
905  }
906  }
907 
914  template<typename T>
915  isString<T> getINFO(std::string tag, T & v)
916  {
917  bcf_info_t * info = bcf_get_info(header->hdr.get(), line.get(), tag.c_str());
918  if(!info)
919  {
920  throw std::invalid_argument("no INFO=" + tag + " in the VCF header.\n");
921  }
922  if(info->type == BCF_BT_CHAR)
923  {
924  v = std::string(reinterpret_cast<char *>(info->vptr), info->vptr_len);
925  return true;
926  }
927  else
928  {
929  return false;
930  }
931  }
932 
939  template<typename T>
940  isScalar<T> setINFO(std::string tag, const T & v)
941  {
942  // bcf_hrec_set_val
943  // bcf_update_info_flag(header.hdr, line, tag.c_str(), NULL, 1);
944  int tag_id = bcf_hdr_id2int(header->hdr.get(), BCF_DT_ID, tag.c_str());
945  if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff))
946  {
947  ret = bcf_update_info_int32(header->hdr.get(), line.get(), tag.c_str(), &v, 1);
948  }
949  else if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff))
950  {
951  float v2 = static_cast<float>(v);
952  ret = bcf_update_info_float(header->hdr.get(), line.get(), tag.c_str(), &v2, 1);
953  }
954  else
955  {
956  ret = -1;
957  }
958  if(ret < 0)
959  {
960 # if defined(VERBOSE)
961  std::cerr << "couldn't set " + tag + " for this variant.\nplease add the tag in headerfirst.\n";
962 # endif
963  return false;
964  }
965  return true;
966  }
967 
974  template<typename T>
975  isValidInfo<T> setINFO(std::string tag, const T & v)
976  {
977  // bcf_update_info_flag(header.hdr, line, tag.c_str(), NULL, 1);
978  int tag_id = bcf_hdr_id2int(header->hdr.get(), BCF_DT_ID, tag.c_str());
979  if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff))
980  {
981  ret = bcf_update_info_int32(header->hdr.get(), line.get(), tag.c_str(), v.data(), v.size());
982  }
983  else if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff))
984  {
985  ret = bcf_update_info_float(header->hdr.get(), line.get(), tag.c_str(), v.data(), v.size());
986  }
987  else if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_INFO, tag_id) == (BCF_HT_STR & 0xff))
988  {
989  ret = bcf_update_info_string(header->hdr.get(), line.get(), tag.c_str(), v.data());
990  }
991  else
992  {
993  ret = -1;
994  }
995 
996  if(ret < 0)
997  {
998 # if defined(VERBOSE)
999  std::cerr << "couldn't set " + tag + " for this variant.\nplease add the tag in headerfirst.\n";
1000 # endif
1001  return false;
1002  }
1003  return true;
1004  }
1005 
1007  void removeINFO(std::string tag)
1008  {
1009  int tag_id = bcf_hdr_id2int(header->hdr.get(), BCF_DT_ID, tag.c_str());
1010  if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff))
1011  {
1012  ret = bcf_update_info_int32(header->hdr.get(), line.get(), tag.c_str(), NULL, 0);
1013  }
1014  else if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff))
1015  {
1016  ret = bcf_update_info_float(header->hdr.get(), line.get(), tag.c_str(), NULL, 0);
1017  }
1018  else if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_INFO, tag_id) == (BCF_HT_STR & 0xff))
1019  {
1020  ret = bcf_update_info_string(header->hdr.get(), line.get(), tag.c_str(), NULL);
1021  }
1022  else
1023  {
1024  ret = -1;
1025  }
1026 
1027  if(ret < 0)
1028  {
1029  throw std::runtime_error("couldn't remove " + tag + " for this variant.\n");
1030  }
1031  }
1032 
1038  bool setGenotypes(const std::vector<int> & v)
1039  {
1040  // bcf_gt_type
1041  int i, j, k;
1042  nploidy = v.size() / nsamples;
1043  int32_t * gt = (int32_t *)malloc(v.size() * sizeof(int32_t));
1044  for(i = 0; i < nsamples; i++)
1045  {
1046  for(j = 0; j < nploidy; j++)
1047  {
1048  k = i * nploidy + j;
1049  if(v[k] == -9 || v[k] == bcf_int32_missing)
1050  {
1051  gt[k] = bcf_gt_missing;
1052  }
1053  else if(gtPhase[i])
1054  {
1055  gt[k] = bcf_gt_phased(v[k]);
1056  }
1057  else
1058  {
1059  gt[k] = bcf_gt_unphased(v[k]);
1060  }
1061  }
1062  }
1063  if(bcf_update_genotypes(header->hdr.get(), line.get(), gt, v.size()) < 0)
1064  {
1065  free(gt);
1066 # if defined(VERBOSE)
1067  std::cerr << "couldn't set genotypes correctly.\n";
1068 # endif
1069  return false;
1070  }
1071  free(gt);
1072  return true;
1073  }
1074 
1079  void setPhasing(const std::vector<char> & v)
1080  {
1081  if((int)v.size() != nsamples)
1082  throw std::runtime_error("the size of input vector is not matching the size of genotypes");
1083  gtPhase = v;
1084  }
1085 
1087  void removeFORMAT(std::string tag)
1088  {
1089  ret = -1;
1090  int tag_id = bcf_hdr_id2int(header->hdr.get(), BCF_DT_ID, tag.c_str());
1091  if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
1092  {
1093  ret = bcf_update_format_int32(header->hdr.get(), line.get(), tag.c_str(), NULL, 0);
1094  }
1095  else if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_FMT, tag_id) == (BCF_HT_STR & 0xff))
1096  {
1097  ret = bcf_update_format_char(header->hdr.get(), line.get(), tag.c_str(), NULL, 0);
1098  }
1099  else if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
1100  {
1101  ret = bcf_update_format_float(header->hdr.get(), line.get(), tag.c_str(), NULL, 0);
1102  }
1103 
1104  if(ret < 0) throw std::runtime_error("couldn't remove " + tag + " correctly.\n");
1105  }
1106 
1113  template<typename T>
1114  isValidFMT<T> setFORMAT(std::string tag, const T & v)
1115  {
1116  int tag_id = bcf_hdr_id2int(header->hdr.get(), BCF_DT_ID, tag.c_str());
1117  if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
1118  {
1119  ret = bcf_update_format_int32(header->hdr.get(), line.get(), tag.c_str(), v.data(), v.size());
1120  }
1121  else if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_FMT, tag_id) == (BCF_HT_STR & 0xff))
1122  {
1123  ret = bcf_update_format_char(header->hdr.get(), line.get(), tag.c_str(), v.data(), v.size());
1124  }
1125  else if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
1126  {
1127  ret = bcf_update_format_float(header->hdr.get(), line.get(), tag.c_str(), v.data(), v.size());
1128  }
1129  else
1130  {
1131  ret = -1;
1132  }
1133 
1134  if(ret < 0)
1135  {
1136 # if defined(VERBOSE)
1137  std::cerr << "couldn't set format " + tag + " corectly.\n";
1138 # endif
1139  return false;
1140  }
1141  return true;
1142  }
1143 
1151  template<typename T>
1152  isScalar<T> setFORMAT(std::string tag, const T & v)
1153  {
1154  float v2 = v;
1155  int tag_id = bcf_hdr_id2int(header->hdr.get(), BCF_DT_ID, tag.c_str());
1156  if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
1157  {
1158  ret = bcf_update_format_int32(header->hdr.get(), line.get(), tag.c_str(), &v, 1);
1159  }
1160  else if(bcf_hdr_id2type(header->hdr.get(), BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
1161  {
1162  ret = bcf_update_format_float(header->hdr.get(), line.get(), tag.c_str(), &v2, 1);
1163  }
1164  else
1165  {
1166  ret = -1;
1167  }
1168  if(ret < 0)
1169  {
1170 # if defined(VERBOSE)
1171  std::cerr << "couldn't set format " + tag + " corectly.\n";
1172 # endif
1173  return false;
1174  }
1175  return true;
1176  }
1177 
1179  void addLineFromString(const std::string & vcfline)
1180  {
1181  kstring_t s = {0, 0, nullptr};
1182  kputsn(vcfline.c_str(), vcfline.length(), &s);
1183  ret = vcf_parse1(&s, header->hdr.get(), line.get());
1184  free(s.s);
1185  if(ret > 0) throw std::runtime_error("error parsing: " + vcfline + "\n");
1186  if(line->errcode == BCF_ERR_CTG_UNDEF)
1187  {
1188  std::string contig(bcf_hdr_id2name(header->hdr.get(), line->rid));
1189  auto hdr_d = bcf_hdr_dup(header->hdr.get());
1190  auto hrec = bcf_hdr_id2hrec(hdr_d, BCF_DT_CTG, 0, line->rid);
1191  bcf_hdr_destroy(hdr_d);
1192  if(hrec == nullptr)
1193  throw std::runtime_error("contig" + contig + " unknow and not found in the header.\n");
1194  ret = bcf_hdr_add_hrec(header->hdr.get(), hrec);
1195  bcf_hrec_destroy(hrec);
1196  if(ret < 0) throw std::runtime_error("error adding contig " + contig + " to header.\n");
1197  ret = bcf_hdr_sync(header->hdr.get());
1198  }
1199  }
1200 
1202  inline bool isNoneMissing() const
1203  {
1204  return noneMissing;
1205  }
1206 
1208  inline bool isSV() const
1209  {
1210  if(bcf_get_info(header->hdr.get(), line.get(), "SVTYPE") == nullptr)
1211  {
1212  return false;
1213  }
1214  else
1215  {
1216  return true;
1217  }
1218  }
1219 
1221  inline bool isIndel() const
1222  {
1223  // REF has multiple allels
1224  if(REF().length() > 1 && !isSV()) return true;
1225  for(int i = 1; i < line->n_allele; i++)
1226  {
1227  std::string alt(line->d.allele[i]);
1228  if(alt == ".") return true;
1229  if(alt.length() != REF().length() && !isSV()) return true;
1230  }
1231  return false;
1232  }
1233 
1235  inline bool isMultiAllelics() const
1236  {
1237  if(line->n_allele <= 2) return false;
1238  return true;
1239  }
1240 
1242  inline bool isMultiAllelicSNP() const
1243  {
1244  // skip REF with length > 1, i.e. INDEL
1245  if(REF().length() > 1 || line->n_allele <= 2) return false;
1246  for(int i = 1; i < line->n_allele; i++)
1247  {
1248  std::string snp(line->d.allele[i]);
1249  if(snp.length() != 1)
1250  {
1251  return false;
1252  }
1253  }
1254  return true;
1255  }
1256 
1259  inline bool isSNP() const
1260  {
1261  // REF and ALT have multiple allels
1262  if(REF().length() > 1 || line->n_allele > 2) return false;
1263  std::string snp(line->d.allele[1]);
1264  if(!(snp == "A" || snp == "C" || snp == "G" || snp == "T"))
1265  {
1266  return false;
1267  }
1268  return true;
1269  }
1270 
1273  inline bool hasSNP() const
1274  {
1275  int type = bcf_has_variant_types(line.get(), VCF_SNP, bcf_match_overlap);
1276  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1277  if(type == 0) return false;
1278  return true;
1279  }
1280 
1282  inline bool hasINDEL() const
1283  {
1284  int type = bcf_has_variant_types(line.get(), VCF_INDEL, bcf_match_overlap);
1285  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1286  if(type == 0) return false;
1287  return true;
1288  }
1289 
1291  inline bool hasINS() const
1292  {
1293  int type = bcf_has_variant_types(line.get(), VCF_INS, bcf_match_overlap);
1294  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1295  if(type == 0) return false;
1296  return true;
1297  }
1298 
1300  inline bool hasDEL() const
1301  {
1302  int type = bcf_has_variant_types(line.get(), VCF_DEL, bcf_match_overlap);
1303  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1304  if(type == 0) return false;
1305  return true;
1306  }
1307 
1309  inline bool hasMNP() const
1310  {
1311  int type = bcf_has_variant_types(line.get(), VCF_MNP, bcf_match_overlap);
1312  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1313  if(type == 0) return false;
1314  return true;
1315  }
1316 
1318  inline bool hasBND() const
1319  {
1320  int type = bcf_has_variant_types(line.get(), VCF_BND, bcf_match_overlap);
1321  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1322  if(type == 0) return false;
1323  return true;
1324  }
1325 
1327  inline bool hasOTHER() const
1328  {
1329  int type = bcf_has_variant_types(line.get(), VCF_OTHER, bcf_match_overlap);
1330  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1331  if(type == 0) return false;
1332  return true;
1333  }
1334 
1336  inline bool hasOVERLAP() const
1337  {
1338  int type = bcf_has_variant_types(line.get(), VCF_OVERLAP, bcf_match_overlap);
1339  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1340  if(type == 0) return false;
1341  return true;
1342  }
1343 
1345  inline std::string CHROM() const
1346  {
1347  return std::string(bcf_hdr_id2name(header->hdr.get(), line->rid));
1348  }
1349 
1351  inline std::string ID() const
1352  {
1353  return std::string(line->d.id);
1354  }
1355 
1357  inline int64_t POS() const
1358  {
1359  return line->pos + 1;
1360  }
1361 
1363  inline void setCHR(const std::string & s)
1364  {
1365  line->rid = bcf_hdr_name2id(header->hdr.get(), s.c_str());
1366  }
1367 
1369  inline void setPOS(int64_t p)
1370  {
1371  line->pos = p - 1;
1372  }
1373 
1375  inline void setID(const std::string & s)
1376  {
1377  bcf_update_id(header->hdr.get(), line.get(), s.c_str());
1378  }
1379 
1381  inline void setRefAlt(const std::string & s)
1382  {
1383  bcf_update_alleles_str(header->hdr.get(), line.get(), s.c_str());
1384  }
1385 
1387  inline void setQUAL(float q)
1388  {
1389  line->qual = q;
1390  }
1391 
1393  inline void setQUAL(char q)
1394  {
1395  bcf_float_set_missing(line->qual);
1396  }
1397 
1399  inline void setFILTER(const std::string & s)
1400  {
1401  int32_t tmpi = bcf_hdr_id2int(header->hdr.get(), BCF_DT_ID, s.c_str());
1402  bcf_add_filter(header->hdr.get(), line.get(), tmpi);
1403  }
1404 
1406  inline int64_t Start() const
1407  {
1408  return line->pos;
1409  }
1410 
1412  inline int64_t End() const
1413  {
1414  return line->pos + line->rlen;
1415  }
1416 
1418  inline std::string REF() const
1419  {
1420  return std::string(line->d.allele[0]);
1421  }
1422 
1424  inline void swap_REF_ALT()
1425  {
1426  if(!isMultiAllelicSNP()) std::swap(line->d.allele[0], line->d.allele[1]);
1427  }
1428 
1430  inline std::string ALT() const
1431  {
1432  std::string s;
1433  for(int i = 1; i < line->n_allele; i++)
1434  {
1435  s += std::string(line->d.allele[i]) + ",";
1436  }
1437  if(s.length() > 1) s.pop_back();
1438  return s;
1439  }
1440 
1442  inline float QUAL()
1443  {
1444  if(bcf_float_is_missing(line->qual))
1445  {
1446  noneMissing = false;
1447  return bcf_float_missing;
1448  }
1449  else
1450  {
1451  return line->qual;
1452  }
1453  }
1454 
1456  inline std::string FILTER()
1457  {
1458  if(line->d.n_flt == 0)
1459  {
1460  return ".";
1461  }
1462  else if(line->d.n_flt == 1)
1463  {
1464  return std::string(bcf_hdr_int2id(header->hdr.get(), BCF_DT_ID, line->d.flt[0]));
1465  }
1466  else
1467  {
1468  std::string s;
1469  for(int i = 1; i < line->d.n_flt; i++)
1470  {
1471  s += std::string(bcf_hdr_int2id(header->hdr.get(), BCF_DT_ID, line->d.flt[i])) + ",";
1472  }
1473  s.pop_back();
1474  return s;
1475  }
1476  }
1477 
1479  inline std::string allINFO()
1480  {
1481  int32_t max_dt_id = header->hdr->n[BCF_DT_ID];
1482  kstring_t * s = (kstring_t *)calloc(1, sizeof(kstring_t));
1483  if(line->n_info)
1484  {
1485  int first = 1;
1486  for(int i = 0; i < line->n_info; ++i)
1487  {
1488  bcf_info_t * z = &line->d.info[i];
1489  if(!z->vptr) continue;
1490  if(!first) kputc(';', s);
1491  first = 0;
1492  if(z->key < 0 || z->key >= max_dt_id || header->hdr->id[BCF_DT_ID][z->key].key == NULL)
1493  throw std::runtime_error("Invalid BCF and wrong INFO tag");
1494  kputs(header->hdr->id[BCF_DT_ID][z->key].key, s);
1495  if(z->len <= 0) continue;
1496  kputc('=', s);
1497  if(z->len == 1)
1498  {
1499  switch(z->type)
1500  {
1501  case BCF_BT_INT8:
1502  if(z->v1.i == bcf_int8_missing)
1503  kputc('.', s);
1504  else
1505  kputw(z->v1.i, s);
1506  break;
1507  case BCF_BT_INT16:
1508  if(z->v1.i == bcf_int16_missing)
1509  kputc('.', s);
1510  else
1511  kputw(z->v1.i, s);
1512  break;
1513  case BCF_BT_INT32:
1514  if(z->v1.i == bcf_int32_missing)
1515  kputc('.', s);
1516  else
1517  kputw(z->v1.i, s);
1518  break;
1519  case BCF_BT_INT64:
1520  if(z->v1.i == bcf_int64_missing)
1521  kputc('.', s);
1522  else
1523  kputll(z->v1.i, s);
1524  break;
1525  case BCF_BT_FLOAT:
1526  if(bcf_float_is_missing(z->v1.f))
1527  kputc('.', s);
1528  else
1529  kputd(z->v1.f, s);
1530  break;
1531  case BCF_BT_CHAR:
1532  kputc(z->v1.i, s);
1533  break;
1534  default:
1535  throw std::runtime_error("Unexpected type in INFO");
1536  }
1537  }
1538  else
1539  bcf_fmt_array(s, z->len, z->type, z->vptr);
1540  }
1541  if(first) kputc('.', s);
1542  }
1543  else
1544  kputc('.', s);
1545  std::string out = std::string(s->s, s->l);
1546  free(s->s);
1547  free(s);
1548  return out;
1549  }
1550 
1552  inline bool allPhased() const
1553  {
1554  return isAllPhased;
1555  }
1556 
1558  inline int ploidy() const
1559  {
1560  return nploidy;
1561  }
1562 
1564  inline void setPloidy(int v)
1565  {
1566  nploidy = v;
1567  }
1568 
1580  std::vector<char> typeOfGT;
1581 
1583  std::vector<char> gtPhase;
1584 };
1585 
1591 {
1592  private:
1593  std::shared_ptr<htsFile> fp; // hts file
1594  std::shared_ptr<hts_idx_t> hidx; // hts index file
1595  std::shared_ptr<tbx_t> tidx; // .tbi .csi index file for vcf files
1596  std::shared_ptr<hts_itr_t> itr; // hts iterator
1597  std::string fname;
1598  bool isBcf = false; // if the input file is bcf or vcf;
1599 
1600  public:
1606  std::vector<std::string> SamplesName;
1607 
1608  public:
1609  ~BcfReader() = default;
1610 
1612  BcfReader() = default;
1613 
1618  BcfReader(const std::string & file) : fname(file)
1619  {
1620  open(file);
1621  }
1622 
1628  BcfReader(const std::string & file, const std::string & region) : fname(file)
1629  {
1630  open(file);
1631  if(file != "-" && !region.empty()) setRegion(region);
1633  }
1634 
1645  BcfReader(const std::string & file, const std::string & region, const std::string & samples) : fname(file)
1646  {
1647  open(file);
1648  if(file != "-" && !region.empty()) setRegion(region);
1649  if(!samples.empty()) setSamples(samples);
1650  }
1651 
1653  void close()
1654  {
1655  if(fp) fp.reset();
1656  if(hidx) hidx.reset();
1657  if(itr) itr.reset();
1658  if(tidx) tidx.reset();
1659  }
1660 
1662  void open(const std::string & file)
1663  {
1664  if(!fname.empty() && fname != file)
1665  throw std::runtime_error("does not support re-open a file yet. " + fname);
1666  fname = file;
1667  fp = std::shared_ptr<htsFile>(hts_open(fname.c_str(), "r"), details::hts_file_close());
1668  if(!fp) throw std::invalid_argument("I/O error: input file is invalid");
1669  enum htsExactFormat hts_format = hts_get_format(fp.get())->format;
1670  if(hts_format == bcf) isBcf = true;
1671  auto h = bcf_hdr_read(fp.get());
1672  header = BcfHeader(h); // make a deep copy
1673  bcf_hdr_destroy(h);
1674  nsamples = header.nSamples();
1676  if(file == "-") return;
1677  if(isBcf)
1678  {
1679  hidx = std::shared_ptr<hts_idx_t>(bcf_index_load(fname.c_str()), details::hts_idx_close());
1680  }
1681  else
1682  {
1683  tidx = std::shared_ptr<tbx_t>(tbx_index_load(fname.c_str()), details::tabix_idx_close());
1684  }
1685  }
1686 
1688  inline int setThreads(int n)
1689  {
1690  return hts_set_threads(fp.get(), n);
1691  }
1692 
1701  int getStatus(const std::string & region)
1702  {
1703  try
1704  {
1705  setRegion(region);
1706  BcfRecord v(header);
1707  if(!getNextVariant(v)) return 0;
1708  }
1709  catch(const std::invalid_argument & e)
1710  {
1711  return -1;
1712  }
1713  catch(const std::runtime_error & e)
1714  {
1715  return -2;
1716  }
1717  return 1;
1718  }
1719 
1725  int getVariantsCount(const std::string & region)
1726  {
1727  int c{0};
1728  setRegion(region);
1729  BcfRecord v(header);
1730  while(getNextVariant(v)) c++;
1731  return c;
1732  }
1733 
1739  void setSamples(const std::string & samples)
1740  {
1741  header.setSamples(samples);
1742  nsamples = bcf_hdr_nsamples(header.hdr);
1744  }
1745 
1753  void setRegion(std::string region)
1754  {
1755  std::string::size_type n;
1756  // if region is chr:pos, turn it into chr:pos-pos
1757  if((n = region.find('-')) == std::string::npos)
1758  {
1759  if((n = region.find(':')) != std::string::npos)
1760  {
1761  region += "-" + region.substr(n + 1, std::string::npos);
1762  }
1763  }
1764  // 1. check and load index first
1765  // 2. query iterval region
1766  // 3. if region is empty, use "."
1767  if(isBcf)
1768  {
1769  if(itr) itr.reset(); // reset current region.
1770  if(region.empty())
1771  itr = std::shared_ptr<hts_itr_t>(bcf_itr_querys(hidx.get(), header.hdr.get(), "."),
1773  else
1774  itr = std::shared_ptr<hts_itr_t>(bcf_itr_querys(hidx.get(), header.hdr.get(), region.c_str()),
1776  }
1777  else
1778  {
1779  if(tidx.get() == nullptr) throw std::invalid_argument(" no tabix index found!");
1780  if(itr) itr.reset(); // reset
1781  if(region.empty())
1782  itr = std::shared_ptr<hts_itr_t>(tbx_itr_querys(tidx.get(), "."), details::hts_iter_close());
1783  else
1784  itr = std::shared_ptr<hts_itr_t>(tbx_itr_querys(tidx.get(), region.c_str()),
1786  }
1787  if(itr.get() == nullptr)
1788  throw std::runtime_error("region was not found! make sure the region format is correct");
1789  }
1790 
1794  {
1795  int ret = -1;
1796  if(itr.get() != nullptr)
1797  {
1798  if(isBcf)
1799  {
1800  ret = bcf_itr_next(fp.get(), itr.get(), r.line.get());
1801  bcf_subset_format(r.header->hdr.get(), r.line.get()); // has to be called explicitly for bcf
1802  bcf_unpack(r.line.get(), BCF_UN_ALL);
1803  return (ret >= 0);
1804  }
1805  else
1806  {
1807  kstring_t s = {0, 0, nullptr}; // kstring
1808  int slen = tbx_itr_next(fp.get(), tidx.get(), itr.get(), &s);
1809  if(slen > 0)
1810  {
1811  ret = vcf_parse1(&s, r.header->hdr.get(), r.line.get()); // ret > 0, error
1812  bcf_unpack(r.line.get(), BCF_UN_ALL);
1813  }
1814  if(s.s) free(s.s);
1815  return (ret <= 0) && (slen > 0);
1816  }
1817  }
1818  else
1819  {
1820  ret = bcf_read(fp.get(), r.header->hdr.get(), r.line.get());
1821  // unpack record immediately. not lazy
1822  bcf_unpack(r.line.get(), BCF_UN_ALL);
1823  return (ret == 0);
1824  }
1825  }
1826 };
1827 
1833 {
1834  private:
1835  std::shared_ptr<htsFile> fp; // hts file
1836  std::shared_ptr<bcf1_t> b = std::shared_ptr<bcf1_t>(bcf_init(), details::bcf_line_close()); // variant
1837  int ret;
1838  bool isHeaderWritten = false;
1839  // std::shared_ptr<BcfHeader> hp;
1840 
1841  public:
1844 
1845  public:
1846  ~BcfWriter() = default;
1847 
1849  BcfWriter() = default;
1850 
1856  BcfWriter(const std::string & fname, std::string version = "VCF4.1")
1857  {
1858  open(fname);
1859  header = BcfHeader(version);
1860  }
1861 
1867  BcfWriter(const std::string & fname, const BcfHeader & h)
1868  {
1869  open(fname);
1870  initalHeader(h);
1871  }
1872 
1883  BcfWriter(const std::string & fname, const std::string & version, const std::string & mode)
1884  {
1885  open(fname, mode);
1886  header = BcfHeader(version);
1887  }
1888 
1899  BcfWriter(const std::string & fname, const BcfHeader & h, const std::string & mode)
1900  {
1901  open(fname, mode);
1902  initalHeader(h);
1903  }
1904 
1909  void open(const std::string & fname)
1910  {
1911  auto mode = details::getMode(fname, "w");
1912  fp = std::shared_ptr<htsFile>(hts_open(fname.c_str(), mode.c_str()), details::hts_file_close());
1913  if(!fp) throw std::invalid_argument("I/O error: input file is invalid");
1914  }
1915 
1925  void open(const std::string & fname, const std::string & mode)
1926  {
1927  fp = std::shared_ptr<htsFile>(hts_open(fname.c_str(), mode.c_str()), details::hts_file_close());
1928  if(!fp) throw std::invalid_argument("I/O error: input file is invalid");
1929  }
1930 
1932  void close()
1933  {
1934  if(!isHeaderWritten) writeHeader();
1935  if(b) b.reset();
1936  if(fp) fp.reset();
1937  }
1938 
1940  void initalHeader(const BcfHeader & h)
1941  {
1942  header = h;
1943  }
1944 
1946  void writeLine(const std::string & vcfline)
1947  {
1948  if(!isHeaderWritten && !writeHeader()) throw std::runtime_error("could not write header\n");
1949  kstring_t s = {0, 0, nullptr};
1950  kputsn(vcfline.c_str(), vcfline.length(), &s);
1951  ret = vcf_parse1(&s, header.hdr.get(), b.get());
1952  free(s.s);
1953  if(ret > 0) throw std::runtime_error("error parsing: " + vcfline + "\n");
1954  if(b->errcode == BCF_ERR_CTG_UNDEF)
1955  {
1956  throw std::runtime_error("contig id " + (std::string)bcf_hdr_id2name(header.hdr.get(), b->rid)
1957  + " not found in the header. please run header->AddContig() first.\n");
1958  }
1959  ret = bcf_write(fp.get(), header.hdr.get(), b.get());
1960  if(ret != 0) throw std::runtime_error("error writing: " + vcfline + "\n");
1961  }
1962 
1965  {
1966  ret = bcf_hdr_write(fp.get(), header.hdr.get());
1967  if(ret == 0) return isHeaderWritten = true;
1968  return false;
1969  }
1970 
1973  {
1974  if(!isHeaderWritten) writeHeader();
1975  if(bcf_write(fp.get(), v.header->hdr.get(), v.line.get()) < 0) return false;
1976  return true;
1977  }
1978 };
1979 
1980 } // namespace vcfpp
1981 
1982 #endif // VCFPP_H_
Object represents header of the VCF, offering methods to access and modify the tags.
Definition: vcfpp.h:219
std::string asString() const
return header as a string
Definition: vcfpp.h:333
void removeFormat(std::string tag) const
remove a FORMAT tag from header
Definition: vcfpp.h:441
void addContig(const std::string &id)
add contig to header
Definition: vcfpp.h:305
void addFILTER(const std::string &id, const std::string &description)
add one FILTER line to header
Definition: vcfpp.h:297
int getFormatType(std::string tag) const
get the type of a given tag
Definition: vcfpp.h:379
void addSample(const std::string &sample) const
add one sample name to header
Definition: vcfpp.h:323
BcfHeader(const bcf_hdr_t *h)
make a copy from existing header
Definition: vcfpp.h:242
void addFORMAT(const std::string &id, const std::string &number, const std::string &type, const std::string &description)
add FORMAT field to header
Definition: vcfpp.h:283
void setVersion(const std::string &version) const
set the VCF version
Definition: vcfpp.h:521
void removeInfo(std::string tag) const
remove a INFO tag from header
Definition: vcfpp.h:435
void setSamples(const std::string &samples) const
explicitly set samples to be extracted
Definition: vcfpp.h:509
BcfHeader(std::string version)
initial a VCF header using the internal template given a specific version.
Definition: vcfpp.h:233
void addLine(const std::string &str)
add one line to header
Definition: vcfpp.h:313
std::vector< std::string > getSeqnames() const
return all contig/chromosome names in a string vector
Definition: vcfpp.h:360
void removeContig(std::string tag) const
remove a contig tag from header
Definition: vcfpp.h:429
int getInfoType(std::string tag) const
get the type of a given tag
Definition: vcfpp.h:406
std::vector< std::string > getSamples() const
return all samples in a string vector
Definition: vcfpp.h:349
int nSamples() const
return the number of samples in the VCF
Definition: vcfpp.h:527
bcf_hdr_t * get() const
Return the raw bcf_hdr_t.
Definition: vcfpp.h:248
friend std::ostream & operator<<(std::ostream &out, const BcfHeader &h)
stream out the header
Definition: vcfpp.h:254
void removeFilter(std::string tag) const
remove a FILTER tag from header
Definition: vcfpp.h:447
void updateSamples(const std::string &samples)
update the sample id in the header
Definition: vcfpp.h:457
void addINFO(const std::string &id, const std::string &number, const std::string &type, const std::string &description)
add INFO field to header
Definition: vcfpp.h:268
Stream in variants from compressed/uncompressed VCF/BCF file or stdin.
Definition: vcfpp.h:1591
void setSamples(const std::string &samples)
explicitly stream to specific samples
Definition: vcfpp.h:1739
BcfReader()=default
Construct an empty BcfReader.
void open(const std::string &file)
Open a VCF/BCF/STDIN file for streaming in.
Definition: vcfpp.h:1662
BcfReader(const std::string &file, const std::string &region, const std::string &samples)
construct a vcf/bcf reader with subset samples in target region
Definition: vcfpp.h:1645
BcfHeader header
a BcfHeader object
Definition: vcfpp.h:1602
int setThreads(int n)
set the number of threads to use
Definition: vcfpp.h:1688
BcfReader(const std::string &file, const std::string &region)
construct a vcf/bcf reader with subset samples
Definition: vcfpp.h:1628
void setRegion(std::string region)
explicitly stream to specific region. throw invalid_argument error if index file not found....
Definition: vcfpp.h:1753
bool getNextVariant(BcfRecord &r)
read in the next variant
Definition: vcfpp.h:1793
int nsamples
number of samples in the VCF
Definition: vcfpp.h:1604
BcfReader(const std::string &file)
construct a vcf/bcf reader from file.
Definition: vcfpp.h:1618
void close()
Close the VCF file and its associated files.
Definition: vcfpp.h:1653
int getVariantsCount(const std::string &region)
count the number of variants by iterating through a given region.
Definition: vcfpp.h:1725
int getStatus(const std::string &region)
query the status of a given region in the VCF
Definition: vcfpp.h:1701
std::vector< std::string > SamplesName
a vector of samples name in the VCF
Definition: vcfpp.h:1606
Object represents a variant record in the VCF, offering methods to access and modify fields.
Definition: vcfpp.h:540
float QUAL()
return QUAL value
Definition: vcfpp.h:1442
bool isIndel() const
return boolean value indicates if current variant is exclusively INDEL
Definition: vcfpp.h:1221
bool hasINDEL() const
return boolean value indicates if current variant has INDEL type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1282
void setRefAlt(const std::string &s)
set REF and ALT alleles given a string seperated by comma
Definition: vcfpp.h:1381
isValidGT< T > getGenotypes(T &v)
fill in the input vector with genotypes of 0 and 1. only works for ploidy<=2. Genotypes with missing ...
Definition: vcfpp.h:621
void setPOS(int64_t p)
modify position given 1-based value
Definition: vcfpp.h:1369
bool hasINS() const
return boolean value indicates if current variant has INS type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1291
std::string allINFO()
return raw INFO column as string. recommend to use getINFO for specific tag.
Definition: vcfpp.h:1479
int ploidy() const
return the number of ploidy of current variant
Definition: vcfpp.h:1558
void removeINFO(std::string tag)
remove the given tag from INFO of the variant
Definition: vcfpp.h:1007
BcfRecord(BcfHeader &h)
constructor with a given BcfHeader object
Definition: vcfpp.h:563
std::string CHROM() const
return CHROM name
Definition: vcfpp.h:1345
void setID(const std::string &s)
update ID
Definition: vcfpp.h:1375
isValidFMT< T > setFORMAT(std::string tag, const T &v)
set tag values for all samples in FORMAT using given vector
Definition: vcfpp.h:1114
int64_t POS() const
return 1-base position
Definition: vcfpp.h:1357
std::string ID() const
return ID field
Definition: vcfpp.h:1351
isScalar< T > setFORMAT(std::string tag, const T &v)
set tag for a single sample in FORMAT using given singular value. this works only when there is one s...
Definition: vcfpp.h:1152
bool isSV() const
return boolean value indicates if current variant is Structual Variant or not
Definition: vcfpp.h:1208
void setQUAL(float q)
modify the QUAL value
Definition: vcfpp.h:1387
bool hasSNP() const
return boolean value indicates if current variant has SNP type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1273
std::vector< char > isGenoMissing
if there is "." in GT for the sample, then it's coded as missing (TRUE)
Definition: vcfpp.h:556
std::string FILTER()
return raw FILTER column as string
Definition: vcfpp.h:1456
void swap_REF_ALT()
swap REF and ALT for biallelic SNP
Definition: vcfpp.h:1424
std::string REF() const
return raw REF alleles as string
Definition: vcfpp.h:1418
std::string asString() const
return current variant as raw string
Definition: vcfpp.h:597
int64_t Start() const
return 0-base start of the variant (can be any type)
Definition: vcfpp.h:1406
bool hasBND() const
return boolean value indicates if current variant has BND type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1318
friend std::ostream & operator<<(std::ostream &out, const BcfRecord &v)
stream out the variant
Definition: vcfpp.h:590
isScalar< T > setINFO(std::string tag, const T &v)
set tag value for INFO
Definition: vcfpp.h:940
void resetHeader(BcfHeader &h)
reset the header associated with BcfRecord object by pointing to another BcfHeader object
Definition: vcfpp.h:584
bool hasMNP() const
return boolean value indicates if current variant has MNP type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1309
bool allPhased() const
return boolean value indicates if genotypes of all samples are phased
Definition: vcfpp.h:1552
bool hasDEL() const
return boolean value indicates if current variant has DEL type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1300
bool isMultiAllelicSNP() const
return boolean value indicates if current variant is exclusively multiallelic SNP sites
Definition: vcfpp.h:1242
void setPhasing(const std::vector< char > &v)
set phasing status for all diploid samples using given vector
Definition: vcfpp.h:1079
bool setGenotypes(const std::vector< int > &v)
set genotypes from scratch even if genotypes not present
Definition: vcfpp.h:1038
std::string ALT() const
return raw ALT alleles as string
Definition: vcfpp.h:1430
bool hasOTHER() const
return boolean value indicates if current variant has OTHER type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1327
isString< T > getINFO(std::string tag, T &v)
get tag value in INFO
Definition: vcfpp.h:915
bool isSNP() const
return boolean value indicates if current variant is exclusively biallelic SNP. Note ALT=* are skippe...
Definition: vcfpp.h:1259
bool isMultiAllelics() const
return boolean value indicates if current variant is multiallelic sites
Definition: vcfpp.h:1235
void setCHR(const std::string &s)
modify CHROM value
Definition: vcfpp.h:1363
int64_t End() const
return 0-base end of the variant (can be any type)
Definition: vcfpp.h:1412
void setPloidy(int v)
in a rare case, one may want to set the number of ploidy manually
Definition: vcfpp.h:1564
isFormatVector< T > getFORMAT(std::string tag, T &v)
get tag value in FORMAT
Definition: vcfpp.h:752
void initHeader(BcfHeader &h)
initilize the header associated with BcfRecord object by pointing to another BcfHeader object
Definition: vcfpp.h:574
void addLineFromString(const std::string &vcfline)
add one variant record from given string
Definition: vcfpp.h:1179
bool getFORMAT(std::string tag, std::vector< std::string > &v)
get tag value in FORMAT
Definition: vcfpp.h:800
bool isNoneMissing() const
if all samples have non missing values for the tag in FORMAT
Definition: vcfpp.h:1202
void setQUAL(char q)
modify the QUAL value
Definition: vcfpp.h:1393
void setFILTER(const std::string &s)
modify the FILTER value
Definition: vcfpp.h:1399
isValidInfo< T > setINFO(std::string tag, const T &v)
set tag value for INFO
Definition: vcfpp.h:975
BcfRecord()=default
empty constructor. call init() afterwards
std::vector< char > gtPhase
vector of nsamples length. keep track of the phasing status of each sample
Definition: vcfpp.h:1583
void removeFORMAT(std::string tag)
remove the given tag from FORMAT of the variant
Definition: vcfpp.h:1087
std::vector< char > typeOfGT
vector of nsamples length. keep track of the type of genotype (one of GT_HOM_RR, GT_HET_RA,...
Definition: vcfpp.h:1580
isInfoVector< T > getINFO(std::string tag, T &v)
get tag value in INFO
Definition: vcfpp.h:840
bool getGenotypes(std::vector< int > &v)
fill in the input vector with genotype values, 0, 1 or -9 (missing).
Definition: vcfpp.h:693
bool hasOVERLAP() const
return boolean value indicates if current variant has OVERLAP type defined in vcf....
Definition: vcfpp.h:1336
isScalar< T > getINFO(std::string tag, T &v)
get tag value in INFO
Definition: vcfpp.h:878
Stream out variants to compressed/uncompressed VCF/BCF file or stdout.
Definition: vcfpp.h:1833
void close()
close the BcfWriter object.
Definition: vcfpp.h:1932
void writeLine(const std::string &vcfline)
copy a string to a vcf line
Definition: vcfpp.h:1946
BcfWriter(const std::string &fname, const std::string &version, const std::string &mode)
Open VCF/BCF file for writing using given mode.
Definition: vcfpp.h:1883
BcfWriter(const std::string &fname, std::string version="VCF4.1")
Open VCF/BCF file for writing. The format is infered from file's suffix.
Definition: vcfpp.h:1856
void open(const std::string &fname)
Open VCF/BCF file for writing. The format is infered from file's suffix.
Definition: vcfpp.h:1909
BcfWriter(const std::string &fname, const BcfHeader &h)
Open VCF/BCF file for writing. The format is infered from file's suffix.
Definition: vcfpp.h:1867
void initalHeader(const BcfHeader &h)
initial a VCF header by pointing to header of another VCF
Definition: vcfpp.h:1940
BcfWriter(const std::string &fname, const BcfHeader &h, const std::string &mode)
Open VCF/BCF file for writing using given mode.
Definition: vcfpp.h:1899
BcfWriter()=default
Construct an empty BcfWriter.
BcfHeader header
header object initialized by initalHeader
Definition: vcfpp.h:1843
void open(const std::string &fname, const std::string &mode)
Open VCF/BCF file for writing using given mode.
Definition: vcfpp.h:1925
bool writeRecord(BcfRecord &v)
streams out the given variant of BcfRecord type
Definition: vcfpp.h:1972
bool writeHeader()
streams out the header
Definition: vcfpp.h:1964
Definition: vcfpp.h:204
Definition: vcfpp.h:195
Definition: vcfpp.h:159
Definition: vcfpp.h:177
Definition: vcfpp.h:168
Definition: vcfpp.h:186