vcfpp
vcfpp.h
1 /*******************************************************************************
2  * @file https://github.com/Zilong-Li/vcfpp/vcfpp.h
3  * @author Zilong Li
4  * @email zilong.dk@gmail.com
5  * @version v0.6.0
6  * @breif a single C++ file for manipulating VCF
7  * Copyright (C) 2022-2023.The use of this code is governed by the LICENSE file.
8  ******************************************************************************/
9 
36 #pragma once
37 
38 #include <cstddef>
39 #include <stdexcept>
40 #ifndef VCFPP_H_
41 # define VCFPP_H_
42 
43 # include <iostream>
44 # include <memory>
45 # include <string>
46 # include <type_traits>
47 # include <vector>
48 
49 // make sure you have htslib installed
50 extern "C"
51 {
52 # include <htslib/hts.h>
53 # include <htslib/kstring.h>
54 # include <htslib/tbx.h>
55 # include <htslib/vcf.h>
56 # include <htslib/vcfutils.h>
57 }
58 
59 namespace vcfpp
60 {
61 template<typename T>
62 using isValidFMT =
63  typename std::enable_if<std::is_same<T, std::string>::value || std::is_same<T, std::vector<char>>::value
64  || std::is_same<T, std::vector<int>>::value
65  || std::is_same<T, std::vector<float>>::value,
66  bool>::type;
67 
68 template<typename T>
69 using isValidInfo =
70  typename std::enable_if<std::is_same<T, std::string>::value || std::is_same<T, std::vector<int>>::value
71  || std::is_same<T, std::vector<float>>::value,
72  bool>::type;
73 
74 template<typename T>
75 using isInfoVector = typename std::enable_if<std::is_same<T, std::vector<int>>::value
76  || std::is_same<T, std::vector<float>>::value,
77  bool>::type;
78 
79 template<typename T>
80 using isScalar = typename std::enable_if<std::is_same<T, int>::value || std::is_same<T, float>::value
81  || std::is_same<T, double>::value,
82  bool>::type;
83 
84 template<typename T>
85 using isString = typename std::enable_if<std::is_same<T, std::string>::value, bool>::type;
86 
87 template<typename T>
88 using isValidGT = typename std::enable_if<std::is_same<T, std::vector<bool>>::value
89  || std::is_same<T, std::vector<char>>::value,
90  bool>::type;
91 
92 template<typename T>
93 using isFormatVector = typename std::enable_if<std::is_same<T, std::vector<float>>::value
94  || std::is_same<T, std::vector<char>>::value
95  || std::is_same<T, std::vector<int>>::value,
96  bool>::type;
97 
98 namespace details
99 {
100 
101 template<typename T>
102 isScalar<T> isMissing(T const & v)
103 {
104  if(std::is_same<T, float>::value)
105  {
106  return bcf_float_is_missing(v);
107  }
108  else if(std::is_same<T, int>::value)
109  {
110  return bcf_int32_missing(v);
111  }
112 }
113 
114 // test if a string is end with another string
115 inline bool isEndWith(std::string const & s, std::string const & e)
116 {
117  if(s.length() >= e.length())
118  {
119  return (0 == s.compare(s.length() - e.length(), e.length(), e));
120  }
121  else
122  {
123  return false;
124  }
125 }
126 
127 // determinate the mode for read/write the compressed/uncompressed VCF/BCF
128 inline std::string getMode(std::string const & fname, std::string mode = "r")
129 {
130  if(isEndWith(fname, "bcf.gz")) mode += "b";
131  if(isEndWith(fname, "bcf")) mode += "bu";
132  if(isEndWith(fname, "vcf.gz")) mode += "z";
133  return mode;
134 }
135 
136 // string split by separator
137 inline std::vector<std::string> split_string(const std::string & s, const std::string & separators)
138 {
139  std::vector<std::string> ret;
140  bool is_seperator[256] = {false};
141  for(auto & ch : separators)
142  {
143  is_seperator[(unsigned int)ch] = true;
144  }
145  int begin = 0;
146  for(int i = 0; i <= (int)s.size(); i++)
147  {
148  if(is_seperator[(uint8_t)s[i]] || i == (int)s.size())
149  {
150  ret.push_back(std::string(s.begin() + begin, s.begin() + i));
151  begin = i + 1;
152  }
153  }
154  return ret;
155 }
156 
157 // deleter for htsFile
159 {
160  void operator()(htsFile * x)
161  {
162  if(x) hts_close(x);
163  }
164 };
165 
166 // deleter for hts iterator
168 {
169  void operator()(hts_itr_t * x)
170  {
171  if(x) hts_itr_destroy(x);
172  }
173 };
174 
175 // deleter for hts idx
177 {
178  void operator()(hts_idx_t * x)
179  {
180  if(x) hts_idx_destroy(x);
181  }
182 };
183 
184 // deleter for tabix idx
186 {
187  void operator()(tbx_t * x)
188  {
189  if(x) tbx_destroy(x);
190  }
191 };
192 
193 // deleter for variant
195 {
196  void operator()(bcf1_t * x)
197  {
198  if(x) bcf_destroy(x);
199  }
200 };
201 
202 // deleter for bcf header
204 {
205  void operator()(bcf_hdr_t * x)
206  {
207  if(x) bcf_hdr_destroy(x);
208  }
209 };
210 
211 } // namespace details
212 
219 {
220  friend class BcfRecord;
221  friend class BcfReader;
222  friend class BcfWriter;
223 
224  private:
225  bcf_hdr_t * hdr = NULL; // bcf header
226  bcf_hrec_t * hrec = NULL; // populate header
227 
228  public:
229  BcfHeader() {}
230 
231  ~BcfHeader()
232  {
233  if(hrec) bcf_hrec_destroy(hrec);
234  if(hdr) bcf_hdr_destroy(hdr);
235  }
236 
238  friend std::ostream & operator<<(std::ostream & out, const BcfHeader & h)
239  {
240  out << h.asString();
241  return out;
242  }
243 
244  // TODO: check if the value is valid for vcf specification
245 
252  inline void addINFO(const std::string & id,
253  const std::string & number,
254  const std::string & type,
255  const std::string & description)
256  {
257  addLine("##INFO=<ID=" + id + ",Number=" + number + ",Type=" + type + ",Description=\"" + description
258  + "\">");
259  }
260 
267  inline void addFORMAT(const std::string & id,
268  const std::string & number,
269  const std::string & type,
270  const std::string & description)
271  {
272  addLine("##FORMAT=<ID=" + id + ",Number=" + number + ",Type=" + type + ",Description=\"" + description
273  + "\">");
274  }
275 
281  inline void addFILTER(const std::string & id, const std::string & description)
282  {
283  addLine("##FILTER=<ID=" + id + ",Description=\"" + description + "\">");
284  }
285 
289  inline void addContig(const std::string & id)
290  {
291  addLine("##contig=<ID=" + id + ">");
292  }
293 
297  inline void addLine(const std::string & str)
298  {
299  int ret = 0;
300  ret = bcf_hdr_append(hdr, str.c_str());
301  if(ret != 0) throw std::runtime_error("could not add " + str + " to header\n");
302  ret = bcf_hdr_sync(hdr);
303  if(ret != 0) throw std::runtime_error("could not add " + str + " to header\n");
304  }
305 
307  inline void addSample(const std::string & sample) const
308  {
309  bcf_hdr_add_sample(hdr, sample.c_str());
310  if(bcf_hdr_sync(hdr) != 0)
311  {
312  throw std::runtime_error("couldn't add the sample.\n");
313  }
314  }
315 
317  inline std::string asString() const
318  {
319  kstring_t s = {0, 0, NULL}; // kstring
320  if(bcf_hdr_format(hdr, 0, &s) == 0) // append header string to s.s! append!
321  {
322  std::string out(s.s, s.l);
323  free(s.s);
324  return out;
325  }
326  else
327  {
328  throw std::runtime_error("failed to convert formatted header to string");
329  }
330  }
331 
333  std::vector<std::string> getSamples() const
334  {
335  std::vector<std::string> vec;
336  for(int i = 0; i < bcf_hdr_nsamples(hdr); i++)
337  {
338  vec.push_back(std::string(hdr->samples[i]));
339  }
340  return vec;
341  }
342 
344  std::vector<std::string> getSeqnames() const
345  {
346  int ret = 0;
347  const char ** seqs = bcf_hdr_seqnames(hdr, &ret);
348  if(ret == 0) printf("there is no contig id in the header!\n");
349  std::vector<std::string> vec;
350  for(int i = 0; i < ret; i++)
351  {
352  vec.push_back(std::string(seqs[i]));
353  }
354  free(seqs);
355  return vec;
356  }
357 
363  inline int getFormatType(std::string tag) const
364  {
365  int tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag.c_str());
366  if(tag_id < 0) return 0;
367  if(bcf_hdr_id2type(hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
368  {
369  return 1;
370  }
371  else if(bcf_hdr_id2type(hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
372  {
373  return 2;
374  }
375  else if(bcf_hdr_id2type(hdr, BCF_HL_FMT, tag_id) == (BCF_HT_STR & 0xff))
376  {
377  return 3;
378  }
379  else
380  {
381  return 0;
382  }
383  }
384 
386  inline void removeContig(std::string tag) const
387  {
388  bcf_hdr_remove(hdr, BCF_HL_CTG, tag.c_str());
389  }
390 
392  inline void removeInfo(std::string tag) const
393  {
394  bcf_hdr_remove(hdr, BCF_HL_INFO, tag.c_str());
395  }
396 
398  inline void removeFormat(std::string tag) const
399  {
400  bcf_hdr_remove(hdr, BCF_HL_FMT, tag.c_str());
401  }
402 
404  inline void removeFilter(std::string tag) const
405  {
406  bcf_hdr_remove(hdr, BCF_HL_FLT, tag.c_str());
407  }
408 
414  void updateSamples(const std::string & samples)
415  {
416  auto ss = details::split_string(samples, ",");
417  const int nsamples = nSamples();
418  if(nsamples != (int)ss.size())
419  throw std::runtime_error("You provide either too few or too many samples");
420  kstring_t htxt = {0, 0, 0};
421  bcf_hdr_format(hdr, 1, &htxt);
422  // Find the beginning of the #CHROM line
423  int i = htxt.l - 2, ncols = 0;
424  while(i >= 0 && htxt.s[i] != '\n')
425  {
426  if(htxt.s[i] == '\t') ncols++;
427  i--;
428  }
429  if(i < 0 || strncmp(htxt.s + i + 1, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", 45))
430  {
431  if(i > 0 && !strncmp(htxt.s + i + 1, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO", 38))
432  throw std::runtime_error("Error: missing FORMAT fields, cowardly refusing to add samples\n");
433  throw std::runtime_error("Could not parse the header: " + std::string(htxt.s));
434  }
435 
436  ncols = 0;
437  while(ncols != 9)
438  {
439  i++;
440  if(htxt.s[i] == '\t') ncols++;
441  }
442  htxt.l = i;
443 
444  // Replace all samples
445  for(i = 0; i < nsamples; i++)
446  {
447  kputc('\t', &htxt);
448  kputs(ss[i].c_str(), &htxt);
449  }
450  kputc('\n', &htxt);
451 
452  // destroy the old and make new header
453  bcf_hdr_destroy(hdr);
454  hdr = bcf_hdr_init("r");
455  if(bcf_hdr_parse(hdr, htxt.s) < 0)
456  throw std::runtime_error("An error occurred while parsing the header\n");
457  // free everything
458  free(htxt.s);
459  }
460 
465  inline void setSamples(const std::string & samples) const
466  {
467  int ret = 0;
468  ret = bcf_hdr_set_samples(hdr, samples.c_str(), 0);
469  if(ret != 0)
470  {
471  throw std::runtime_error("the " + std::to_string(ret)
472  + "-th sample are not in the VCF.\nparameter samples:" + samples);
473  }
474  }
475 
477  inline void setVersion(const std::string & version) const
478  {
479  bcf_hdr_set_version(hdr, version.c_str());
480  }
481 
483  inline int nSamples() const
484  {
485  return bcf_hdr_nsamples(hdr);
486  }
487 };
488 
496 {
497  friend class BcfReader;
498  friend class BcfWriter;
499 
500  private:
501  BcfHeader * header;
502  std::shared_ptr<bcf1_t> line = std::shared_ptr<bcf1_t>(bcf_init(), details::bcf_line_close()); // variant
503  bcf_hdr_t * hdr_d = NULL; // a dup header by bcf_hdr_dup(header->hdr)
504  bcf_fmt_t * fmt = NULL;
505  bcf_info_t * info = NULL;
506  int32_t * gts = NULL;
507  int ndst, ret, nsamples;
508  bool noneMissing = true; // whenever parsing a tag have to reset this variable
509  bool isAllPhased = false;
510  int nploidy = 0; // the number of ploidy
511  int nvalues = 0; // the number of values for a tag in FORMAT
512 
513  public:
515  std::vector<char> isGenoMissing;
516 
517  public:
520 
523  {
524  initHeader(h);
525  }
526 
527  ~BcfRecord()
528  {
529  if(gts) free(gts);
530  if(hdr_d) bcf_hdr_destroy(hdr_d);
531  }
532 
535  {
536  header = &h;
537  if(!header->hdr) throw std::runtime_error("please initial header first\n");
538  nsamples = header->nSamples();
539  typeOfGT.resize(nsamples);
540  gtPhase.resize(nsamples, 0);
541  }
542 
545  {
546  header = &h;
547  }
548 
550  friend std::ostream & operator<<(std::ostream & out, const BcfRecord & v)
551  {
552  out << v.asString();
553  return out;
554  }
555 
557  inline std::string asString() const
558  {
559  kstring_t s = {0, 0, NULL}; // kstring
560  if(vcf_format(header->hdr, line.get(), &s) == 0)
561  {
562  std::string out(s.s, s.l);
563  free(s.s);
564  return out;
565  }
566  else
567  {
568  throw std::runtime_error("couldn't format current record into a string.\n");
569  }
570  }
571 
580  template<typename T>
581  isValidGT<T> getGenotypes(T & v)
582  {
583  ndst = 0;
584  ret = bcf_get_genotypes(header->hdr, line.get(), &gts, &ndst);
585  if(ret <= 0)
586  {
587 # if defined(VERBOSE)
588  std::cerr << "GT not present for current site. did you initilize the variant object?\n";
589 # endif
590  return false;
591  }
592  // if nploidy is not set manually. find the max nploidy using the first variant (eg. 2) resize v as
593  // max(nploidy)
594  // * nsamples (ret) NOTE: if ret == nsamples and only one sample then haploid
595  if(ret != nploidy * nsamples && nploidy > 0)
596  {
597  // rare case if nploidy is set manually. eg. only one sample. the first variant is '1' but the
598  // second is '1/0'. ret = 1 but nploidy should be 2
599  v.resize(nploidy * nsamples);
600  }
601  else
602  {
603  v.resize(ret);
604  nploidy = ret / nsamples;
605  }
606  // work with nploidy == 1, haploid?
607  isGenoMissing.assign(nsamples, 0);
608  int i, j, nphased = 0;
609  noneMissing = true;
610  fmt = bcf_get_fmt(header->hdr, line.get(), "GT");
611  int nploidy_cur = ret / nsamples; // requires nploidy_cur <= nploidy
612  for(i = 0; i < nsamples; i++)
613  {
614  // check and fill in typeOfGT; only supports SNPs now. check vcfstats.c for inspiration
615  typeOfGT[i] = bcf_gt_type(fmt, i, NULL, NULL);
616  if(typeOfGT[i] == GT_UNKN)
617  {
618  noneMissing = false; // set missing as het, user should check if isNoneMissing();
619  isGenoMissing[i] = 1;
620  v[i * nploidy + 0] = 1;
621  for(j = 1; j < nploidy_cur; j++) v[i * nploidy + j] = 0;
622  continue;
623  }
624 
625  for(j = 0; j < nploidy_cur; j++)
626  {
627  // TODO: right now only parse 0 and 1, ie max(nploidy)=2; other values 2,3... will be set to 1
628  v[i * nploidy + j] = bcf_gt_allele(gts[j + i * nploidy_cur]) != 0;
629  }
630  if(nploidy == 2)
631  {
632  gtPhase[i] = (gts[1 + i * nploidy_cur] & 1) == 1;
633  nphased += gtPhase[i];
634  }
635  }
636  if(nphased == nsamples)
637  {
638  isAllPhased = true;
639  }
640  else
641  {
642  isAllPhased = false;
643  }
644  return true;
645  }
646 
654  bool getGenotypes(std::vector<int> & v)
655  {
656  ndst = 0;
657  ret = bcf_get_genotypes(header->hdr, line.get(), &gts, &ndst);
658  if(ret <= 0)
659  {
660 # if defined(VERBOSE)
661  std::cerr << "GT not present for current site. did you initilize the variant object?\n";
662 # endif
663  return false;
664  }
665  v.resize(ret);
666  isGenoMissing.assign(nsamples, 0);
667  nploidy = ret / nsamples;
668  int i, j, nphased = 0;
669  noneMissing = true;
670  for(i = 0; i < nsamples; i++)
671  {
672  int32_t * ptr = gts + i * nploidy;
673  int is_phased = 0;
674  for(j = 0; j < nploidy; j++)
675  {
676  // if true, the sample has smaller ploidy
677  if(ptr[j] == bcf_int32_vector_end) break;
678  // missing allele
679  if(bcf_gt_is_missing(ptr[j]))
680  {
681  noneMissing = false;
682  isGenoMissing[i] = 1;
683  v[i * nploidy + j] = -9;
684  continue;
685  }
686  v[i * nploidy + j] = bcf_gt_allele(ptr[j]);
687  is_phased += bcf_gt_is_phased(ptr[j]);
688  }
689  if(nploidy == is_phased)
690  {
691  gtPhase[i] = true;
692  nphased += gtPhase[i];
693  }
694  }
695  if(nphased == nsamples)
696  {
697  isAllPhased = true;
698  }
699  else
700  {
701  isAllPhased = false;
702  }
703  return true;
704  }
705 
712  template<typename T, typename S = typename T::value_type>
713  isFormatVector<T> getFORMAT(std::string tag, T & v)
714  {
715  fmt = bcf_get_fmt(header->hdr, line.get(), tag.c_str());
716  if(!fmt)
717  {
718  throw std::invalid_argument("no FORMAT=" + tag + " in the VCF header.\n");
719  }
720  nvalues = fmt->n;
721  ndst = 0;
722  S * dst = NULL;
723  int tagid = header->getFormatType(tag);
724  if(tagid == 1)
725  {
726  ret = bcf_get_format_int32(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
727  }
728  else if(tagid == 2)
729  {
730  ret = bcf_get_format_float(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
731  }
732  else if(tagid == 3)
733  {
734  ret = bcf_get_format_char(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
735  }
736  else
737  {
738  ret = -1;
739  }
740 
741  if(ret >= 0)
742  {
743  // user have to check if there is missing in the return v;
744  v = std::vector<S>(dst, dst + ret);
745  free(dst);
746  return true;
747  }
748  else
749  {
750  free(dst);
751  return false;
752  }
753  }
754 
761  bool getFORMAT(std::string tag, std::vector<std::string> & v)
762  {
763  fmt = bcf_get_fmt(header->hdr, line.get(), tag.c_str());
764  if(!fmt)
765  {
766  throw std::invalid_argument("no FORMAT=" + tag + " in the VCF header.\n");
767  }
768  nvalues = fmt->n;
769  // if ndst < (fmt->n+1)*nsmpl; then realloc is involved
770  ret = -1, ndst = 0;
771  char ** dst = NULL;
772  if(header->getFormatType(tag) == 3)
773  ret = bcf_get_format_string(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
774  if(ret > 0)
775  {
776  v.clear();
777  for(int i = 0; i < nsamples; i++)
778  {
779  // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT.
780  v.emplace_back(dst[i]);
781  };
782  free(dst[0]);
783  free(dst);
784  return true;
785  }
786  else
787  {
788  free(dst[0]);
789  free(dst);
790  return false;
791  }
792  }
793 
800  template<typename T, typename S = typename T::value_type>
801  isInfoVector<T> getINFO(std::string tag, T & v)
802  {
803  info = bcf_get_info(header->hdr, line.get(), tag.c_str());
804  if(!info)
805  {
806  throw std::invalid_argument("no INFO=" + tag + " in the VCF header.\n");
807  }
808  S * dst = NULL;
809  ndst = 0;
810  ret = -1;
811  if(info->type == BCF_BT_INT8 || info->type == BCF_BT_INT16 || info->type == BCF_BT_INT32)
812  {
813  ret = bcf_get_info_int32(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
814  }
815  else if(info->type == BCF_BT_FLOAT)
816  {
817  ret = bcf_get_info_float(header->hdr, line.get(), tag.c_str(), &dst, &ndst);
818  }
819  if(ret >= 0)
820  {
821  v = std::vector<S>(dst, dst + ret); // user have to check if there is missing in the return v;
822  free(dst);
823  return true;
824  }
825  else
826  {
827  free(dst);
828  return false;
829  }
830  }
831 
838  template<typename T>
839  isScalar<T> getINFO(std::string tag, T & v)
840  {
841  info = bcf_get_info(header->hdr, line.get(), tag.c_str());
842  if(!info)
843  {
844  throw std::invalid_argument("no INFO=" + tag + " in the VCF header.\n");
845  }
846  // scalar value
847  if(info->len == 1)
848  {
849  if(info->type == BCF_BT_INT8 || info->type == BCF_BT_INT16 || info->type == BCF_BT_INT32)
850  {
851  v = info->v1.i;
852  }
853  else if(info->type == BCF_BT_FLOAT)
854  {
855  v = info->v1.f;
856  }
857  return true;
858  }
859  else
860  {
861 # if defined(VERBOSE)
862  std::cerr << "there are multiple values for " + tag
863  + " in INFO for current site. please use vector instead\n";
864 # endif
865  return false;
866  }
867  }
868 
875  template<typename T>
876  isString<T> getINFO(std::string tag, T & v)
877  {
878  info = bcf_get_info(header->hdr, line.get(), tag.c_str());
879  if(!info)
880  {
881  throw std::invalid_argument("no INFO=" + tag + " in the VCF header.\n");
882  }
883  if(info->type == BCF_BT_CHAR)
884  {
885  v = std::string(reinterpret_cast<char *>(info->vptr), info->vptr_len);
886  return true;
887  }
888  else
889  {
890  return false;
891  }
892  }
893 
900  template<typename T>
901  isScalar<T> setINFO(std::string tag, const T & v)
902  {
903  // bcf_hrec_set_val
904  // bcf_update_info_flag(header.hdr, line, tag.c_str(), NULL, 1);
905  int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
906  if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff))
907  {
908  ret = bcf_update_info_int32(header->hdr, line.get(), tag.c_str(), &v, 1);
909  }
910  else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff))
911  {
912  float v2 = static_cast<float>(v);
913  ret = bcf_update_info_float(header->hdr, line.get(), tag.c_str(), &v2, 1);
914  }
915  else
916  {
917  ret = -1;
918  }
919  if(ret < 0)
920  {
921 # if defined(VERBOSE)
922  std::cerr << "couldn't set " + tag + " for this variant.\nplease add the tag in headerfirst.\n";
923 # endif
924  return false;
925  }
926  return true;
927  }
928 
935  template<typename T>
936  isValidInfo<T> setINFO(std::string tag, const T & v)
937  {
938  // bcf_update_info_flag(header.hdr, line, tag.c_str(), NULL, 1);
939  int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
940  if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff))
941  {
942  ret = bcf_update_info_int32(header->hdr, line.get(), tag.c_str(), v.data(), v.size());
943  }
944  else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff))
945  {
946  ret = bcf_update_info_float(header->hdr, line.get(), tag.c_str(), v.data(), v.size());
947  }
948  else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_STR & 0xff))
949  {
950  ret = bcf_update_info_string(header->hdr, line.get(), tag.c_str(), v.data());
951  }
952  else
953  {
954  ret = -1;
955  }
956 
957  if(ret < 0)
958  {
959 # if defined(VERBOSE)
960  std::cerr << "couldn't set " + tag + " for this variant.\nplease add the tag in headerfirst.\n";
961 # endif
962  return false;
963  }
964  return true;
965  }
966 
968  void removeINFO(std::string tag)
969  {
970  int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
971  if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_INT & 0xff))
972  {
973  ret = bcf_update_info_int32(header->hdr, line.get(), tag.c_str(), NULL, 0);
974  }
975  else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_REAL & 0xff))
976  {
977  ret = bcf_update_info_float(header->hdr, line.get(), tag.c_str(), NULL, 0);
978  }
979  else if(bcf_hdr_id2type(header->hdr, BCF_HL_INFO, tag_id) == (BCF_HT_STR & 0xff))
980  {
981  ret = bcf_update_info_string(header->hdr, line.get(), tag.c_str(), NULL);
982  }
983  else
984  {
985  ret = -1;
986  }
987 
988  if(ret < 0)
989  {
990  throw std::runtime_error("couldn't remove " + tag + " for this variant.\n");
991  }
992  }
993 
999  bool setGenotypes(const std::vector<int> & v)
1000  {
1001  // bcf_gt_type
1002  int i, j, k;
1003  nploidy = v.size() / nsamples;
1004  int32_t * gt = (int32_t *)malloc(v.size() * sizeof(int32_t));
1005  for(i = 0; i < nsamples; i++)
1006  {
1007  for(j = 0; j < nploidy; j++)
1008  {
1009  k = i * nploidy + j;
1010  if(v[k] == -9 || v[k] == bcf_int32_missing)
1011  {
1012  gt[k] = bcf_gt_missing;
1013  }
1014  else if(gtPhase[i])
1015  {
1016  gt[k] = bcf_gt_phased(v[k]);
1017  }
1018  else
1019  {
1020  gt[k] = bcf_gt_unphased(v[k]);
1021  }
1022  }
1023  }
1024  if(bcf_update_genotypes(header->hdr, line.get(), gt, v.size()) < 0)
1025  {
1026  free(gt);
1027 # if defined(VERBOSE)
1028  std::cerr << "couldn't set genotypes correctly.\n";
1029 # endif
1030  return false;
1031  }
1032  free(gt);
1033  return true;
1034  }
1035 
1040  void setPhasing(const std::vector<char> & v)
1041  {
1042  if((int)v.size() != nsamples)
1043  throw std::runtime_error("the size of input vector is not matching the size of genotypes");
1044  gtPhase = v;
1045  }
1046 
1048  void removeFORMAT(std::string tag)
1049  {
1050  ret = -1;
1051  int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
1052  if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
1053  {
1054  ret = bcf_update_format_int32(header->hdr, line.get(), tag.c_str(), NULL, 0);
1055  }
1056  else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_STR & 0xff))
1057  {
1058  ret = bcf_update_format_char(header->hdr, line.get(), tag.c_str(), NULL, 0);
1059  }
1060  else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
1061  {
1062  ret = bcf_update_format_float(header->hdr, line.get(), tag.c_str(), NULL, 0);
1063  }
1064 
1065  if(ret < 0) throw std::runtime_error("couldn't remove " + tag + " correctly.\n");
1066  }
1067 
1074  template<typename T>
1075  isValidFMT<T> setFORMAT(std::string tag, const T & v)
1076  {
1077  int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
1078  if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
1079  {
1080  ret = bcf_update_format_int32(header->hdr, line.get(), tag.c_str(), v.data(), v.size());
1081  }
1082  else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_STR & 0xff))
1083  {
1084  ret = bcf_update_format_char(header->hdr, line.get(), tag.c_str(), v.data(), v.size());
1085  }
1086  else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
1087  {
1088  ret = bcf_update_format_float(header->hdr, line.get(), tag.c_str(), v.data(), v.size());
1089  }
1090  else
1091  {
1092  ret = -1;
1093  }
1094 
1095  if(ret < 0)
1096  {
1097 # if defined(VERBOSE)
1098  std::cerr << "couldn't set format " + tag + " corectly.\n";
1099 # endif
1100  return false;
1101  }
1102  return true;
1103  }
1104 
1112  template<typename T>
1113  isScalar<T> setFORMAT(std::string tag, const T & v)
1114  {
1115  float v2 = v;
1116  int tag_id = bcf_hdr_id2int(header->hdr, BCF_DT_ID, tag.c_str());
1117  if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_INT & 0xff))
1118  {
1119  ret = bcf_update_format_int32(header->hdr, line.get(), tag.c_str(), &v, 1);
1120  }
1121  else if(bcf_hdr_id2type(header->hdr, BCF_HL_FMT, tag_id) == (BCF_HT_REAL & 0xff))
1122  {
1123  ret = bcf_update_format_float(header->hdr, line.get(), tag.c_str(), &v2, 1);
1124  }
1125  else
1126  {
1127  ret = -1;
1128  }
1129  if(ret < 0)
1130  {
1131 # if defined(VERBOSE)
1132  std::cerr << "couldn't set format " + tag + " corectly.\n";
1133 # endif
1134  return false;
1135  }
1136  return true;
1137  }
1138 
1140  void addLineFromString(const std::string & vcfline)
1141  {
1142  kstring_t s = {0, 0, NULL};
1143  kputsn(vcfline.c_str(), vcfline.length(), &s);
1144  ret = vcf_parse1(&s, header->hdr, line.get());
1145  free(s.s);
1146  if(ret > 0) throw std::runtime_error("error parsing: " + vcfline + "\n");
1147  if(line->errcode == BCF_ERR_CTG_UNDEF)
1148  {
1149  std::string contig(bcf_hdr_id2name(header->hdr, line->rid));
1150  hdr_d = bcf_hdr_dup(header->hdr);
1151  header->hrec = bcf_hdr_id2hrec(hdr_d, BCF_DT_CTG, 0, line->rid);
1152  if(header->hrec == NULL)
1153  throw std::runtime_error("contig" + contig + " unknow and not found in the header.\n");
1154  ret = bcf_hdr_add_hrec(header->hdr, header->hrec);
1155  if(ret < 0) throw std::runtime_error("error adding contig " + contig + " to header.\n");
1156  ret = bcf_hdr_sync(header->hdr);
1157  }
1158  }
1159 
1161  inline bool isNoneMissing() const
1162  {
1163  return noneMissing;
1164  }
1165 
1167  inline bool isSV() const
1168  {
1169  if(bcf_get_info(header->hdr, line.get(), "SVTYPE") == NULL)
1170  {
1171  return false;
1172  }
1173  else
1174  {
1175  return true;
1176  }
1177  }
1178 
1180  inline bool isIndel() const
1181  {
1182  // REF has multiple allels
1183  if(REF().length() > 1 && !isSV()) return true;
1184  for(int i = 1; i < line->n_allele; i++)
1185  {
1186  std::string alt(line->d.allele[i]);
1187  if(alt == ".") return true;
1188  if(alt.length() != REF().length() && !isSV()) return true;
1189  }
1190  return false;
1191  }
1192 
1194  inline bool isMultiAllelics() const
1195  {
1196  if(line->n_allele <= 2) return false;
1197  return true;
1198  }
1199 
1201  inline bool isMultiAllelicSNP() const
1202  {
1203  // skip REF with length > 1, i.e. INDEL
1204  if(REF().length() > 1 || line->n_allele <= 2) return false;
1205  for(int i = 1; i < line->n_allele; i++)
1206  {
1207  std::string snp(line->d.allele[i]);
1208  if(snp.length() != 1)
1209  {
1210  return false;
1211  }
1212  }
1213  return true;
1214  }
1215 
1218  inline bool isSNP() const
1219  {
1220  // REF and ALT have multiple allels
1221  if(REF().length() > 1 || line->n_allele > 2) return false;
1222  std::string snp(line->d.allele[1]);
1223  if(!(snp == "A" || snp == "C" || snp == "G" || snp == "T"))
1224  {
1225  return false;
1226  }
1227  return true;
1228  }
1229 
1232  inline bool hasSNP() const
1233  {
1234  int type = bcf_has_variant_types(line.get(), VCF_SNP, bcf_match_overlap);
1235  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1236  if(type == 0) return false;
1237  return true;
1238  }
1239 
1241  inline bool hasINDEL() const
1242  {
1243  int type = bcf_has_variant_types(line.get(), VCF_INDEL, bcf_match_overlap);
1244  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1245  if(type == 0) return false;
1246  return true;
1247  }
1248 
1250  inline bool hasINS() const
1251  {
1252  int type = bcf_has_variant_types(line.get(), VCF_INS, bcf_match_overlap);
1253  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1254  if(type == 0) return false;
1255  return true;
1256  }
1257 
1259  inline bool hasDEL() const
1260  {
1261  int type = bcf_has_variant_types(line.get(), VCF_DEL, bcf_match_overlap);
1262  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1263  if(type == 0) return false;
1264  return true;
1265  }
1266 
1268  inline bool hasMNP() const
1269  {
1270  int type = bcf_has_variant_types(line.get(), VCF_MNP, bcf_match_overlap);
1271  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1272  if(type == 0) return false;
1273  return true;
1274  }
1275 
1277  inline bool hasBND() const
1278  {
1279  int type = bcf_has_variant_types(line.get(), VCF_BND, bcf_match_overlap);
1280  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1281  if(type == 0) return false;
1282  return true;
1283  }
1284 
1286  inline bool hasOTHER() const
1287  {
1288  int type = bcf_has_variant_types(line.get(), VCF_OTHER, bcf_match_overlap);
1289  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1290  if(type == 0) return false;
1291  return true;
1292  }
1293 
1295  inline bool hasOVERLAP() const
1296  {
1297  int type = bcf_has_variant_types(line.get(), VCF_OVERLAP, bcf_match_overlap);
1298  if(type < 0) throw std::runtime_error("something wrong with variant type\n");
1299  if(type == 0) return false;
1300  return true;
1301  }
1302 
1304  inline std::string CHROM() const
1305  {
1306  return std::string(bcf_hdr_id2name(header->hdr, line->rid));
1307  }
1308 
1310  inline std::string ID() const
1311  {
1312  return std::string(line->d.id);
1313  }
1314 
1316  inline int64_t POS() const
1317  {
1318  return line->pos + 1;
1319  }
1320 
1322  inline void setCHR(const std::string & s)
1323  {
1324  line->rid = bcf_hdr_name2id(header->hdr, s.c_str());
1325  }
1326 
1328  inline void setPOS(int64_t p)
1329  {
1330  line->pos = p - 1;
1331  }
1332 
1334  inline void setID(const std::string & s)
1335  {
1336  bcf_update_id(header->hdr, line.get(), s.c_str());
1337  }
1338 
1340  inline void setRefAlt(const std::string & s)
1341  {
1342  bcf_update_alleles_str(header->hdr, line.get(), s.c_str());
1343  }
1344 
1346  inline void setQUAL(float q)
1347  {
1348  line->qual = q;
1349  }
1350 
1352  inline void setQUAL(char q)
1353  {
1354  bcf_float_set_missing(line->qual);
1355  }
1356 
1358  inline void setFILTER(const std::string & s)
1359  {
1360  int32_t tmpi = bcf_hdr_id2int(header->hdr, BCF_DT_ID, s.c_str());
1361  bcf_add_filter(header->hdr, line.get(), tmpi);
1362  }
1363 
1365  inline int64_t Start() const
1366  {
1367  return line->pos;
1368  }
1369 
1371  inline int64_t End() const
1372  {
1373  return line->pos + line->rlen;
1374  }
1375 
1377  inline std::string REF() const
1378  {
1379  return std::string(line->d.allele[0]);
1380  }
1381 
1383  inline void swap_REF_ALT()
1384  {
1385  if(!isMultiAllelicSNP()) std::swap(line->d.allele[0], line->d.allele[1]);
1386  }
1387 
1389  inline std::string ALT() const
1390  {
1391  std::string s;
1392  for(int i = 1; i < line->n_allele; i++)
1393  {
1394  s += std::string(line->d.allele[i]) + ",";
1395  }
1396  if(s.length() > 1) s.pop_back();
1397  return s;
1398  }
1399 
1401  inline float QUAL()
1402  {
1403  if(bcf_float_is_missing(line->qual))
1404  {
1405  noneMissing = false;
1406  return bcf_float_missing;
1407  }
1408  else
1409  {
1410  return line->qual;
1411  }
1412  }
1413 
1415  inline std::string FILTER()
1416  {
1417  if(line->d.n_flt == 0)
1418  {
1419  return ".";
1420  }
1421  else if(line->d.n_flt == 1)
1422  {
1423  return std::string(bcf_hdr_int2id(header->hdr, BCF_DT_ID, line->d.flt[0]));
1424  }
1425  else
1426  {
1427  std::string s;
1428  for(int i = 1; i < line->d.n_flt; i++)
1429  {
1430  s += std::string(bcf_hdr_int2id(header->hdr, BCF_DT_ID, line->d.flt[i])) + ",";
1431  }
1432  s.pop_back();
1433  return s;
1434  }
1435  }
1436 
1438  inline std::string allINFO()
1439  {
1440  int32_t max_dt_id = header->hdr->n[BCF_DT_ID];
1441  kstring_t * s = (kstring_t *)calloc(1, sizeof(kstring_t));
1442  if(line->n_info)
1443  {
1444  int first = 1;
1445  for(int i = 0; i < line->n_info; ++i)
1446  {
1447  bcf_info_t * z = &line->d.info[i];
1448  if(!z->vptr) continue;
1449  if(!first) kputc(';', s);
1450  first = 0;
1451  if(z->key < 0 || z->key >= max_dt_id || header->hdr->id[BCF_DT_ID][z->key].key == NULL)
1452  throw std::runtime_error("Invalid BCF and wrong INFO tag");
1453  kputs(header->hdr->id[BCF_DT_ID][z->key].key, s);
1454  if(z->len <= 0) continue;
1455  kputc('=', s);
1456  if(z->len == 1)
1457  {
1458  switch(z->type)
1459  {
1460  case BCF_BT_INT8:
1461  if(z->v1.i == bcf_int8_missing)
1462  kputc('.', s);
1463  else
1464  kputw(z->v1.i, s);
1465  break;
1466  case BCF_BT_INT16:
1467  if(z->v1.i == bcf_int16_missing)
1468  kputc('.', s);
1469  else
1470  kputw(z->v1.i, s);
1471  break;
1472  case BCF_BT_INT32:
1473  if(z->v1.i == bcf_int32_missing)
1474  kputc('.', s);
1475  else
1476  kputw(z->v1.i, s);
1477  break;
1478  case BCF_BT_INT64:
1479  if(z->v1.i == bcf_int64_missing)
1480  kputc('.', s);
1481  else
1482  kputll(z->v1.i, s);
1483  break;
1484  case BCF_BT_FLOAT:
1485  if(bcf_float_is_missing(z->v1.f))
1486  kputc('.', s);
1487  else
1488  kputd(z->v1.f, s);
1489  break;
1490  case BCF_BT_CHAR:
1491  kputc(z->v1.i, s);
1492  break;
1493  default:
1494  throw std::runtime_error("Unexpected type in INFO");
1495  }
1496  }
1497  else
1498  bcf_fmt_array(s, z->len, z->type, z->vptr);
1499  }
1500  if(first) kputc('.', s);
1501  }
1502  else
1503  kputc('.', s);
1504  std::string out = std::string(s->s, s->l);
1505  free(s->s);
1506  free(s);
1507  return out;
1508  }
1509 
1511  inline bool allPhased() const
1512  {
1513  return isAllPhased;
1514  }
1515 
1517  inline int ploidy() const
1518  {
1519  return nploidy;
1520  }
1521 
1523  inline void setPloidy(int v)
1524  {
1525  nploidy = v;
1526  }
1527 
1539  std::vector<char> typeOfGT;
1540 
1542  std::vector<char> gtPhase;
1543 };
1544 
1550 {
1551  private:
1552  std::shared_ptr<htsFile> fp; // hts file
1553  std::shared_ptr<hts_idx_t> hidx; // hts index file
1554  std::shared_ptr<tbx_t> tidx; // .tbi .csi index file for vcf files
1555  std::shared_ptr<hts_itr_t> itr; // hts iterator
1556  kstring_t s = {0, 0, NULL}; // kstring
1557  std::string fname;
1558  bool isBcf; // if the input file is bcf or vcf;
1559 
1560  public:
1566  std::vector<std::string> SamplesName;
1567 
1570 
1575  BcfReader(const std::string & file) : fname(file)
1576  {
1577  open(file);
1578  }
1579 
1585  BcfReader(const std::string & file, const std::string & region) : fname(file)
1586  {
1587  open(file);
1588  if(!region.empty()) setRegion(region);
1590  }
1591 
1602  BcfReader(const std::string & file, const std::string & region, const std::string & samples) : fname(file)
1603  {
1604  open(file);
1605  if(!region.empty()) setRegion(region);
1606  if(!samples.empty()) setSamples(samples);
1607  }
1608 
1609  ~BcfReader()
1610  {
1611  if(s.s) free(s.s);
1612  }
1613 
1615  void close()
1616  {
1617  if(fp) fp.reset();
1618  if(hidx) hidx.reset();
1619  if(itr) itr.reset();
1620  if(tidx) tidx.reset();
1621  }
1622 
1624  void open(const std::string & file)
1625  {
1626  if(!fname.empty() && fname != file)
1627  throw std::runtime_error("does not support re-open a file yet. " + fname);
1628  fname = file;
1629  fp = std::shared_ptr<htsFile>(hts_open(fname.c_str(), "r"), details::hts_file_close());
1630  if(!fp) throw std::invalid_argument("I/O error: input file is invalid");
1631  header.hdr = bcf_hdr_read(fp.get());
1632  nsamples = bcf_hdr_nsamples(header.hdr);
1634  }
1635 
1637  inline int setThreads(int n)
1638  {
1639  return hts_set_threads(fp.get(), n);
1640  }
1641 
1643  const BcfHeader & getHeader() const
1644  {
1645  return header;
1646  }
1647 
1656  int getStatus(const std::string & region)
1657  {
1658  try
1659  {
1660  setRegion(region);
1661  BcfRecord v(header);
1662  if(!getNextVariant(v)) return 0;
1663  }
1664  catch(const std::invalid_argument & e)
1665  {
1666  return -1;
1667  }
1668  catch(const std::runtime_error & e)
1669  {
1670  return -2;
1671  }
1672  return 1;
1673  }
1674 
1680  int getVariantsCount(const std::string & region)
1681  {
1682  int c{0};
1683  setRegion(region);
1684  BcfRecord v(header);
1685  while(getNextVariant(v)) c++;
1686  return c;
1687  }
1688 
1694  void setSamples(const std::string & samples)
1695  {
1696  header.setSamples(samples);
1697  nsamples = bcf_hdr_nsamples(header.hdr);
1699  }
1700 
1708  void setRegion(const std::string & region)
1709  {
1710  // 1. check and load index first
1711  // 2. query iterval region
1712  // 3. if region is empty, use "."
1713  if(details::isEndWith(fname, "bcf") || details::isEndWith(fname, "bcf.gz"))
1714  {
1715  isBcf = true;
1716  hidx = std::shared_ptr<hts_idx_t>(bcf_index_load(fname.c_str()), details::hts_idx_close());
1717  if(itr) itr.reset(); // reset current region.
1718  if(region.empty())
1719  itr = std::shared_ptr<hts_itr_t>(bcf_itr_querys(hidx.get(), header.hdr, "."),
1721  else
1722  itr = std::shared_ptr<hts_itr_t>(bcf_itr_querys(hidx.get(), header.hdr, region.c_str()),
1724  }
1725  else
1726  {
1727  isBcf = false;
1728  tidx = std::shared_ptr<tbx_t>(tbx_index_load(fname.c_str()), details::tabix_idx_close());
1729  if(tidx.get() == NULL) throw std::invalid_argument(" no tabix index found!");
1730  if(itr) itr.reset(); // reset
1731  if(region.empty())
1732  itr = std::shared_ptr<hts_itr_t>(tbx_itr_querys(tidx.get(), "."), details::hts_iter_close());
1733  else
1734  itr = std::shared_ptr<hts_itr_t>(tbx_itr_querys(tidx.get(), region.c_str()),
1736  }
1737  if(itr.get() == NULL)
1738  throw std::runtime_error("region was not found! make sure the region format is correct");
1739  }
1740 
1744  {
1745  int ret = -1;
1746  if(itr.get() != NULL)
1747  {
1748  if(isBcf)
1749  {
1750  ret = bcf_itr_next(fp.get(), itr.get(), r.line.get());
1751  bcf_unpack(r.line.get(), BCF_UN_ALL);
1752  return (ret >= 0);
1753  }
1754  else
1755  {
1756  int slen = tbx_itr_next(fp.get(), tidx.get(), itr.get(), &s);
1757  if(slen > 0)
1758  {
1759  ret = vcf_parse1(&s, r.header->hdr, r.line.get()); // ret > 0, error
1760  bcf_unpack(r.line.get(), BCF_UN_ALL);
1761  }
1762  return (ret <= 0) && (slen > 0);
1763  }
1764  }
1765  else
1766  {
1767  ret = bcf_read(fp.get(), r.header->hdr, r.line.get());
1768  // unpack record immediately. not lazy
1769  bcf_unpack(r.line.get(), BCF_UN_ALL);
1770  return (ret == 0);
1771  }
1772  }
1773 };
1774 
1780 {
1781  private:
1782  std::shared_ptr<htsFile> fp; // hts file
1783  std::shared_ptr<bcf1_t> b = std::shared_ptr<bcf1_t>(bcf_init(), details::bcf_line_close()); // variant
1784  int ret;
1785  bool isHeaderWritten = false;
1786  const BcfHeader * hp;
1787 
1788  public:
1791 
1794 
1800  BcfWriter(const std::string & fname, std::string version = "VCF4.1")
1801  {
1802  open(fname);
1803  initalHeader(version);
1805  }
1806 
1812  BcfWriter(const std::string & fname, const BcfHeader & h)
1813  {
1814  open(fname);
1815  initalHeader(h);
1816  }
1817 
1828  BcfWriter(const std::string & fname, const std::string & version, const std::string & mode)
1829  {
1830  open(fname, mode);
1831  initalHeader(version);
1833  }
1834 
1845  BcfWriter(const std::string & fname, const BcfHeader & h, const std::string & mode)
1846  {
1847  open(fname, mode);
1848  initalHeader(h);
1849  }
1850 
1851  ~BcfWriter() {}
1852 
1857  void open(const std::string & fname)
1858  {
1859  auto mode = details::getMode(fname, "w");
1860  fp = std::shared_ptr<htsFile>(hts_open(fname.c_str(), mode.c_str()), details::hts_file_close());
1861  if(!fp) throw std::invalid_argument("I/O error: input file is invalid");
1862  }
1863 
1873  void open(const std::string & fname, const std::string & mode)
1874  {
1875  fp = std::shared_ptr<htsFile>(hts_open(fname.c_str(), mode.c_str()), details::hts_file_close());
1876  if(!fp) throw std::invalid_argument("I/O error: input file is invalid");
1877  }
1878 
1880  void close()
1881  {
1882  if(!isHeaderWritten) writeHeader();
1883  if(b) b.reset();
1884  if(fp) fp.reset();
1885  }
1886 
1888  void initalHeader(std::string version = "VCF4.1")
1889  {
1890  header.hdr = bcf_hdr_init("w");
1891  header.setVersion(version);
1892  }
1893 
1895  void initalHeader(const BcfHeader & h)
1896  {
1897  hp = &h;
1898  }
1899 
1901  void copyHeader(const std::string & vcffile, std::string samples = "-")
1902  {
1903  htsFile * fp2 = hts_open(vcffile.c_str(), "r");
1904  if(!fp2) throw std::invalid_argument("I/O error: input file is invalid");
1905  if(samples == "")
1906  { // site-only
1907  bcf_hdr_t * hfull = bcf_hdr_read(fp2);
1908  header.hdr = bcf_hdr_subset(hfull, 0, 0, 0);
1909  bcf_hdr_remove(header.hdr, BCF_HL_FMT, NULL);
1910  bcf_hdr_destroy(hfull);
1911  }
1912  else
1913  {
1914  header.hdr = bcf_hdr_read(fp2);
1915  header.setSamples(samples);
1916  }
1917  hts_close(fp2);
1919  }
1920 
1922  void writeLine(const std::string & vcfline)
1923  {
1924  if(!isHeaderWritten && !writeHeader()) throw std::runtime_error("could not write header\n");
1925  kstring_t s = {0, 0, NULL};
1926  kputsn(vcfline.c_str(), vcfline.length(), &s);
1927  ret = vcf_parse1(&s, hp->hdr, b.get());
1928  free(s.s);
1929  if(ret > 0) throw std::runtime_error("error parsing: " + vcfline + "\n");
1930  if(b->errcode == BCF_ERR_CTG_UNDEF)
1931  {
1932  throw std::runtime_error("contig id " + (std::string)bcf_hdr_id2name(hp->hdr, b->rid)
1933  + " not found in the header. please run header->AddContig() first.\n");
1934  }
1935  ret = bcf_write(fp.get(), hp->hdr, b.get());
1936  if(ret != 0) throw std::runtime_error("error writing: " + vcfline + "\n");
1937  }
1938 
1941  {
1942  ret = bcf_hdr_write(fp.get(), hp->hdr);
1943  if(ret == 0) return isHeaderWritten = true;
1944  return false;
1945  }
1946 
1949  {
1950  if(!isHeaderWritten) writeHeader();
1951  if(bcf_write(fp.get(), v.header->hdr, v.line.get()) < 0) return false;
1952  return true;
1953  }
1954 };
1955 
1956 } // namespace vcfpp
1957 
1958 #endif // VCFPP_H_
Object represents header of the VCF, offering methods to access and modify the tags.
Definition: vcfpp.h:219
std::string asString() const
return header as a string
Definition: vcfpp.h:317
void removeFormat(std::string tag) const
remove a FORMAT tag from header
Definition: vcfpp.h:398
void addContig(const std::string &id)
add contig to header
Definition: vcfpp.h:289
void addFILTER(const std::string &id, const std::string &description)
add one FILTER line to header
Definition: vcfpp.h:281
int getFormatType(std::string tag) const
get the type of a given tag
Definition: vcfpp.h:363
void addSample(const std::string &sample) const
add one sample name to header
Definition: vcfpp.h:307
void addFORMAT(const std::string &id, const std::string &number, const std::string &type, const std::string &description)
add FORMAT field to header
Definition: vcfpp.h:267
void setVersion(const std::string &version) const
set the VCF version
Definition: vcfpp.h:477
void removeInfo(std::string tag) const
remove a INFO tag from header
Definition: vcfpp.h:392
void setSamples(const std::string &samples) const
explicitly set samples to be extracted
Definition: vcfpp.h:465
void addLine(const std::string &str)
add one line to header
Definition: vcfpp.h:297
std::vector< std::string > getSeqnames() const
return all contig/chromosome names in a string vector
Definition: vcfpp.h:344
void removeContig(std::string tag) const
remove a contig tag from header
Definition: vcfpp.h:386
std::vector< std::string > getSamples() const
return all samples in a string vector
Definition: vcfpp.h:333
int nSamples() const
return the number of samples in the VCF
Definition: vcfpp.h:483
friend std::ostream & operator<<(std::ostream &out, const BcfHeader &h)
stream out the header
Definition: vcfpp.h:238
void removeFilter(std::string tag) const
remove a FILTER tag from header
Definition: vcfpp.h:404
void updateSamples(const std::string &samples)
update the sample id in the header
Definition: vcfpp.h:414
void addINFO(const std::string &id, const std::string &number, const std::string &type, const std::string &description)
add INFO field to header
Definition: vcfpp.h:252
Stream in variants from compressed/uncompressed VCF/BCF file or stdin.
Definition: vcfpp.h:1550
void setSamples(const std::string &samples)
explicitly stream to specific samples
Definition: vcfpp.h:1694
void open(const std::string &file)
Open a VCF/BCF/STDIN file for streaming in.
Definition: vcfpp.h:1624
BcfReader(const std::string &file, const std::string &region, const std::string &samples)
construct a vcf/bcf reader with subset samples in target region
Definition: vcfpp.h:1602
BcfHeader header
a BcfHeader object
Definition: vcfpp.h:1562
int setThreads(int n)
set the number of threads to use
Definition: vcfpp.h:1637
BcfReader(const std::string &file, const std::string &region)
construct a vcf/bcf reader with subset samples
Definition: vcfpp.h:1585
bool getNextVariant(BcfRecord &r)
read in the next variant
Definition: vcfpp.h:1743
BcfReader()
Construct an empty BcfReader.
Definition: vcfpp.h:1569
int nsamples
number of samples in the VCF
Definition: vcfpp.h:1564
const BcfHeader & getHeader() const
return a BcfHeader object
Definition: vcfpp.h:1643
BcfReader(const std::string &file)
construct a vcf/bcf reader from file.
Definition: vcfpp.h:1575
void close()
Close the VCF file and its associated files.
Definition: vcfpp.h:1615
void setRegion(const std::string &region)
explicitly stream to specific region. throw invalid_argument error if index file not found....
Definition: vcfpp.h:1708
int getVariantsCount(const std::string &region)
count the number of variants by iterating through a given region.
Definition: vcfpp.h:1680
int getStatus(const std::string &region)
query the status of a given region in the VCF
Definition: vcfpp.h:1656
std::vector< std::string > SamplesName
a vector of samples name in the VCF
Definition: vcfpp.h:1566
Object represents a variant record in the VCF, offering methods to access and modify fields.
Definition: vcfpp.h:496
float QUAL()
return QUAL value
Definition: vcfpp.h:1401
bool isIndel() const
return boolean value indicates if current variant is exclusively INDEL
Definition: vcfpp.h:1180
bool hasINDEL() const
return boolean value indicates if current variant has INDEL type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1241
void setRefAlt(const std::string &s)
set REF and ALT alleles given a string seperated by comma
Definition: vcfpp.h:1340
isValidGT< T > getGenotypes(T &v)
fill in the input vector with genotypes of 0 and 1. only works for ploidy<=2. Genotypes with missing ...
Definition: vcfpp.h:581
void setPOS(int64_t p)
modify position given 1-based value
Definition: vcfpp.h:1328
bool hasINS() const
return boolean value indicates if current variant has INS type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1250
BcfRecord()
empty constructor. call init() afterwards
Definition: vcfpp.h:519
std::string allINFO()
return raw INFO column as string. recommend to use getINFO for specific tag.
Definition: vcfpp.h:1438
int ploidy() const
return the number of ploidy of current variant
Definition: vcfpp.h:1517
void removeINFO(std::string tag)
remove the given tag from INFO of the variant
Definition: vcfpp.h:968
BcfRecord(BcfHeader &h)
constructor with a given BcfHeader object
Definition: vcfpp.h:522
std::string CHROM() const
return CHROM name
Definition: vcfpp.h:1304
void setID(const std::string &s)
update ID
Definition: vcfpp.h:1334
isValidFMT< T > setFORMAT(std::string tag, const T &v)
set tag values for all samples in FORMAT using given vector
Definition: vcfpp.h:1075
int64_t POS() const
return 1-base position
Definition: vcfpp.h:1316
std::string ID() const
return ID field
Definition: vcfpp.h:1310
isScalar< T > setFORMAT(std::string tag, const T &v)
set tag for a single sample in FORMAT using given singular value. this works only when there is one s...
Definition: vcfpp.h:1113
bool isSV() const
return boolean value indicates if current variant is Structual Variant or not
Definition: vcfpp.h:1167
void setQUAL(float q)
modify the QUAL value
Definition: vcfpp.h:1346
bool hasSNP() const
return boolean value indicates if current variant has SNP type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1232
std::vector< char > isGenoMissing
if there is "." in GT for the sample, then it's coded as missing (TRUE)
Definition: vcfpp.h:515
std::string FILTER()
return raw FILTER column as string
Definition: vcfpp.h:1415
void swap_REF_ALT()
swap REF and ALT for biallelic SNP
Definition: vcfpp.h:1383
std::string REF() const
return raw REF alleles as string
Definition: vcfpp.h:1377
std::string asString() const
return current variant as raw string
Definition: vcfpp.h:557
int64_t Start() const
return 0-base start of the variant (can be any type)
Definition: vcfpp.h:1365
bool hasBND() const
return boolean value indicates if current variant has BND type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1277
friend std::ostream & operator<<(std::ostream &out, const BcfRecord &v)
stream out the variant
Definition: vcfpp.h:550
isScalar< T > setINFO(std::string tag, const T &v)
set tag value for INFO
Definition: vcfpp.h:901
void resetHeader(BcfHeader &h)
reset the header associated with BcfRecord object by pointing to another BcfHeader object
Definition: vcfpp.h:544
bool hasMNP() const
return boolean value indicates if current variant has MNP type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1268
bool allPhased() const
return boolean value indicates if genotypes of all samples are phased
Definition: vcfpp.h:1511
bool hasDEL() const
return boolean value indicates if current variant has DEL type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1259
bool isMultiAllelicSNP() const
return boolean value indicates if current variant is exclusively multiallelic SNP sites
Definition: vcfpp.h:1201
void setPhasing(const std::vector< char > &v)
set phasing status for all diploid samples using given vector
Definition: vcfpp.h:1040
bool setGenotypes(const std::vector< int > &v)
set genotypes from scratch even if genotypes not present
Definition: vcfpp.h:999
std::string ALT() const
return raw ALT alleles as string
Definition: vcfpp.h:1389
bool hasOTHER() const
return boolean value indicates if current variant has OTHER type defined in vcf.h (htslib>=1....
Definition: vcfpp.h:1286
isString< T > getINFO(std::string tag, T &v)
get tag value in INFO
Definition: vcfpp.h:876
bool isSNP() const
return boolean value indicates if current variant is exclusively biallelic SNP. Note ALT=* are skippe...
Definition: vcfpp.h:1218
bool isMultiAllelics() const
return boolean value indicates if current variant is multiallelic sites
Definition: vcfpp.h:1194
void setCHR(const std::string &s)
modify CHROM value
Definition: vcfpp.h:1322
int64_t End() const
return 0-base end of the variant (can be any type)
Definition: vcfpp.h:1371
void setPloidy(int v)
in a rare case, one may want to set the number of ploidy manually
Definition: vcfpp.h:1523
isFormatVector< T > getFORMAT(std::string tag, T &v)
get tag value in FORMAT
Definition: vcfpp.h:713
void initHeader(BcfHeader &h)
initilize the header associated with BcfRecord object by pointing to another BcfHeader object
Definition: vcfpp.h:534
void addLineFromString(const std::string &vcfline)
add one variant record from given string
Definition: vcfpp.h:1140
bool getFORMAT(std::string tag, std::vector< std::string > &v)
get tag value in FORMAT
Definition: vcfpp.h:761
bool isNoneMissing() const
if all samples have non missing values for the tag in FORMAT
Definition: vcfpp.h:1161
void setQUAL(char q)
modify the QUAL value
Definition: vcfpp.h:1352
void setFILTER(const std::string &s)
modify the FILTER value
Definition: vcfpp.h:1358
isValidInfo< T > setINFO(std::string tag, const T &v)
set tag value for INFO
Definition: vcfpp.h:936
std::vector< char > gtPhase
vector of nsamples length. keep track of the phasing status of each sample
Definition: vcfpp.h:1542
void removeFORMAT(std::string tag)
remove the given tag from FORMAT of the variant
Definition: vcfpp.h:1048
std::vector< char > typeOfGT
vector of nsamples length. keep track of the type of genotype (one of GT_HOM_RR, GT_HET_RA,...
Definition: vcfpp.h:1539
isInfoVector< T > getINFO(std::string tag, T &v)
get tag value in INFO
Definition: vcfpp.h:801
bool getGenotypes(std::vector< int > &v)
fill in the input vector with genotype values, 0, 1 or -9 (missing).
Definition: vcfpp.h:654
bool hasOVERLAP() const
return boolean value indicates if current variant has OVERLAP type defined in vcf....
Definition: vcfpp.h:1295
isScalar< T > getINFO(std::string tag, T &v)
get tag value in INFO
Definition: vcfpp.h:839
Stream out variants to compressed/uncompressed VCF/BCF file or stdout.
Definition: vcfpp.h:1780
void close()
close the BcfWriter object.
Definition: vcfpp.h:1880
void writeLine(const std::string &vcfline)
copy a string to a vcf line
Definition: vcfpp.h:1922
BcfWriter(const std::string &fname, const std::string &version, const std::string &mode)
Open VCF/BCF file for writing using given mode.
Definition: vcfpp.h:1828
void copyHeader(const std::string &vcffile, std::string samples="-")
copy header of given VCF and restrict on samples. if samples=="", FORMAT removed and only sites left
Definition: vcfpp.h:1901
BcfWriter(const std::string &fname, std::string version="VCF4.1")
Open VCF/BCF file for writing. The format is infered from file's suffix.
Definition: vcfpp.h:1800
void open(const std::string &fname)
Open VCF/BCF file for writing. The format is infered from file's suffix.
Definition: vcfpp.h:1857
BcfWriter(const std::string &fname, const BcfHeader &h)
Open VCF/BCF file for writing. The format is infered from file's suffix.
Definition: vcfpp.h:1812
void initalHeader(const BcfHeader &h)
initial a VCF header by pointing to header of another VCF
Definition: vcfpp.h:1895
BcfWriter(const std::string &fname, const BcfHeader &h, const std::string &mode)
Open VCF/BCF file for writing using given mode.
Definition: vcfpp.h:1845
void initalHeader(std::string version="VCF4.1")
initial a VCF header using the internal template given a specific version. VCF4.1 is the default
Definition: vcfpp.h:1888
BcfHeader header
header object initialized by initalHeader
Definition: vcfpp.h:1790
BcfWriter()
Construct an empty BcfWriter.
Definition: vcfpp.h:1793
void open(const std::string &fname, const std::string &mode)
Open VCF/BCF file for writing using given mode.
Definition: vcfpp.h:1873
bool writeRecord(BcfRecord &v)
streams out the given variant of BcfRecord type
Definition: vcfpp.h:1948
bool writeHeader()
streams out the header
Definition: vcfpp.h:1940
Definition: vcfpp.h:204
Definition: vcfpp.h:195
Definition: vcfpp.h:159
Definition: vcfpp.h:177
Definition: vcfpp.h:168
Definition: vcfpp.h:186