qcLFrDFb

· 7 years ago · Nov 24, 2018, 02:28 PM
1//
2// RapMap - Rapid and accurate mapping of short reads to transcriptomes using
3// quasi-mapping.
4// Copyright (C) 2015, 2016, 2017 Rob Patro, Avi Srivastava, Hirak Sarkar
5//
6// This file is part of RapMap.
7//
8// RapMap is free software: you can redistribute it and/or modify
9// it under the terms of the GNU General Public License as published by
10// the Free Software Foundation, either version 3 of the License, or
11// (at your option) any later version.
12//
13// RapMap is distributed in the hope that it will be useful,
14// but WITHOUT ANY WARRANTY; without even the implied warranty of
15// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16// GNU General Public License for more details.
17//
18// You should have received a copy of the GNU General Public License
19// along with RapMap.  If not, see <http://www.gnu.org/licenses/>.
20//
21
22#include <algorithm>
23#include <cctype>
24#include <cstdio>
25#include <fstream>
26#include <iostream>
27#include <iterator>
28#include <memory>
29#include <mutex>
30#include <random>
31#include <type_traits>
32#include <unordered_map>
33#include <vector>
34#include <string.h>
35
36#include "tclap/CmdLine.h"
37
38#include <cereal/archives/binary.hpp>
39#include <cereal/archives/json.hpp>
40#include <cereal/types/string.hpp>
41#include <cereal/types/unordered_map.hpp>
42#include <cereal/types/utility.hpp>
43#include <cereal/types/vector.hpp>
44
45#include "BooMap.hpp"
46#include "FrugalBooMap.hpp"
47#include "xxhash.h"
48
49#include "spdlog/spdlog.h"
50
51#include "FastxParser.hpp"
52// Jellyfish 2 include
53#include "jellyfish/mer_dna.hpp"
54
55#include "divsufsort.h"
56#include "divsufsort64.h"
57
58#include "RapMapFileSystem.hpp"
59#include "RapMapUtils.hpp"
60#include "ScopedTimer.hpp"
61#include "bit_array.h"
62
63#include "JFRaw.hpp"
64#include "jellyfish/binary_dumper.hpp"
65#include "jellyfish/file_header.hpp"
66#include "jellyfish/hash_counter.hpp"
67#include "jellyfish/mer_iterator.hpp"
68#include "jellyfish/mer_overlap_sequence_parser.hpp"
69#include "jellyfish/thread_exec.hpp"
70#include "rank9b.h"
71
72#include "IndexHeader.hpp"
73
74// sha functionality
75#include "picosha2.h"
76
77#include <chrono>
78
79using single_parser = fastx_parser::FastxParser<fastx_parser::ReadSeq>;
80using TranscriptID = uint32_t;
81using TranscriptIDVector = std::vector<TranscriptID>;
82using KmerIDMap = std::vector<TranscriptIDVector>;
83using MerMapT = jellyfish::cooperative::hash_counter<rapmap::utils::my_mer>;
84
85bool buildSA(const std::string& outputDir, std::string& concatText, size_t tlen,
86             std::vector<int64_t>& SA) {
87  // IndexT is the signed index type
88  // UIndexT is the unsigned index type
89  using IndexT = int64_t;
90  using UIndexT = uint64_t;
91  bool success{false};
92
93  std::ofstream saStream(outputDir + "sa.bin", std::ios::binary);
94  {
95    ScopedTimer timer;
96    SA.resize(tlen, 0);
97    IndexT textLen = static_cast<IndexT>(tlen);
98    std::cerr << "Building suffix array . . . ";
99    auto ret = divsufsort64(
100        reinterpret_cast<unsigned char*>(const_cast<char*>(concatText.data())),
101        SA.data(), tlen);
102
103    success = (ret == 0);
104    if (success) {
105      std::cerr << "success\n";
106      {
107        ScopedTimer timer2;
108        std::cerr << "saving to disk . . . ";
109        cereal::BinaryOutputArchive saArchive(saStream);
110        saArchive(SA);
111        std::cerr << "done\n";
112      }
113    } else {
114      std::cerr << "FAILURE: return code from libdivsufsort64() was " << ret
115                << "\n";
116      saStream.close();
117      std::exit(1);
118    }
119    std::cerr << "done\n";
120  }
121  saStream.close();
122  return success;
123}
124
125// IndexT is the index type.
126// int32_t for "small" suffix arrays
127// int64_t for "large" ones
128template <typename IndexT>
129bool buildPerfectHash(const std::string& outputDir, std::string& concatText,
130                      size_t tlen, uint32_t k, std::vector<IndexT>& SA,
131                      uint32_t numHashThreads) {
132  //BooMap<uint64_t, rapmap::utils::SAInterval<IndexT>> intervals;
133  PerfectHashT<uint64_t, rapmap::utils::SAInterval<IndexT>> intervals;
134  intervals.setSAPtr(&SA);
135  intervals.setTextPtr(concatText.data(), concatText.length());
136
137  // The start and stop of the current interval
138  IndexT start = 0, stop = 0;
139  // An iterator to the beginning of the text
140  auto textB = concatText.begin();
141  auto textE = concatText.end();
142  // The current k-mer as a string
143  rapmap::utils::my_mer mer;
144  bool currentValid{false};
145  std::string currentKmer;
146  std::string nextKmer;
147  while (stop < tlen) {
148    // Check if the string starting at the
149    // current position is valid (i.e. doesn't contain $)
150    // and is <= k bases from the end of the string
151    nextKmer = concatText.substr(SA[stop], k);
152    if (nextKmer.length() == k and
153        nextKmer.find_first_of('$') == std::string::npos) {
154      // If this is a new k-mer, then hash the current k-mer
155      if (nextKmer != currentKmer) {
156        if (currentKmer.length() == k and
157            currentKmer.find_first_of('$') == std::string::npos) {
158          mer = rapmap::utils::my_mer(currentKmer);
159          auto bits = mer.get_bits(0, 2 * k);
160          intervals.add(std::move(bits), {start, stop});
161          // push_back(std::make_pair<uint64_t,
162          // rapmap::utils::SAInterval<IndexT>>(std::move(bits), {start,
163          // stop}));
164        }
165        currentKmer = nextKmer;
166        start = stop;
167      }
168    } else {
169      // If this isn't a valid suffix (contains a $)
170      // If the previous interval was valid, put it
171      // in the hash.
172      if (currentKmer.length() == k and
173          currentKmer.find_first_of('$') == std::string::npos) {
174        mer = rapmap::utils::my_mer(currentKmer);
175        auto bits = mer.get_bits(0, 2 * k);
176        // intervals.push_back(std::make_pair<uint64_t,
177        // rapmap::utils::SAInterval<IndexT>>(std::move(bits), {start, stop}));
178        intervals.add(std::move(bits), {start, stop});
179      }
180      // The current interval is invalid and empty
181      currentKmer = nextKmer;
182      start = stop;
183    }
184    if (stop % 1000000 == 0) {
185      std::cerr << "\r\rprocessed " << stop << " positions";
186    }
187    // We always update the end position
188    ++stop;
189  }
190  if (start < tlen) {
191    if (currentKmer.length() == k and
192        currentKmer.find_first_of('$') != std::string::npos) {
193      mer = rapmap::utils::my_mer(currentKmer);
194      auto bits = mer.get_bits(0, 2 * k);
195      // intervals.push_back(std::make_pair<uint64_t,
196      // rapmap::utils::SAInterval<IndexT>>(std::move(bits), {start, stop}));
197      intervals.add(std::move(bits), {start, stop});
198    }
199  }
200
201  // std::cerr << "\nthere are " << intervals.size() << " intervals of the
202  // selected depth\n";
203
204  std::cout << "building perfect hash function\n";
205  intervals.build(numHashThreads);
206  std::cout << "\ndone.\n";
207  std::string outputPrefix = outputDir + "hash_info";
208  std::cout << "saving the perfect hash and SA intervals to disk ... ";
209  intervals.save(outputPrefix);
210  std::cout << "done.\n";
211
212  return true;
213}
214
215bool buildSA(const std::string& outputDir, std::string& concatText, size_t tlen,
216             std::vector<int32_t>& SA) {
217  // IndexT is the signed index type
218  // UIndexT is the unsigned index type
219  using IndexT = int32_t;
220  using UIndexT = uint32_t;
221  bool success{false};
222
223  std::ofstream saStream(outputDir + "sa.bin", std::ios::binary);
224  {
225    ScopedTimer timer;
226    SA.resize(tlen, 0);
227    IndexT textLen = static_cast<IndexT>(tlen);
228    std::cerr << "Building suffix array . . . ";
229    auto ret = divsufsort(
230        reinterpret_cast<unsigned char*>(const_cast<char*>(concatText.data())),
231        SA.data(), tlen);
232
233    success = (ret == 0);
234    if (success) {
235      std::cerr << "success\n";
236      {
237        ScopedTimer timer2;
238        std::cerr << "saving to disk . . . ";
239        cereal::BinaryOutputArchive saArchive(saStream);
240        saArchive(SA);
241        std::cerr << "done\n";
242      }
243    } else {
244      std::cerr << "FAILURE: return code from libdivsufsort() was " << ret
245                << "\n";
246      saStream.close();
247      std::exit(1);
248    }
249    std::cerr << "done\n";
250  }
251  saStream.close();
252  return success;
253}
254
255// IndexT is the index type.
256// int32_t for "small" suffix arrays
257// int64_t for "large" ones
258template <typename IndexT>
259bool buildHash(const std::string& outputDir, std::string& concatText,
260               size_t tlen, uint32_t k, std::vector<IndexT>& SA) {
261  // Now, build the k-mer lookup table
262    // The base type should always be uint64_t
263    using WordT = rapmap::utils::my_mer::base_type;
264    RegHashT<WordT, rapmap::utils::SAInterval<IndexT>,
265                         rapmap::utils::KmerKeyHasher> khash;
266    //RegHashT<uint64_t, IndexT, rapmap::utils::KmerKeyHasher> overflowhash;
267    //std::cerr << "sizeof(SAInterval<IndexT>) = " << sizeof(rapmap::utils::SAInterval<IndexT>) << '\n';
268    //khash.set_empty_key(std::numeric_limits<uint64_t>::max());
269  // The start and stop of the current interval
270  IndexT start = 0, stop = 0;
271  // An iterator to the beginning of the text
272  auto textB = concatText.begin();
273  auto textE = concatText.end();
274  // The current k-mer as a string
275  rapmap::utils::my_mer mer;
276  bool currentValid{false};
277  std::string currentKmer;
278  std::string nextKmer;
279  while (stop < tlen) {
280    // Check if the string starting at the
281    // current position is valid (i.e. doesn't contain $)
282    // and is <= k bases from the end of the string
283    nextKmer = concatText.substr(SA[stop], k);
284    if (nextKmer.length() == k and
285        nextKmer.find_first_of('$') == std::string::npos) {
286      // If this is a new k-mer, then hash the current k-mer
287      if (nextKmer != currentKmer) {
288        if (currentKmer.length() == k and
289            currentKmer.find_first_of('$') == std::string::npos) {
290          mer = currentKmer;
291          auto bits = mer.word(0);
292          auto hashIt = khash.find(bits);
293          if (hashIt == khash.end()) {
294            if (start > 1) {
295              if (concatText.substr(SA[start - 1], k) ==
296                  concatText.substr(SA[start], k)) {
297                std::cerr << "T[SA[" << start - 1 << "]:" << k
298                          << "] = " << concatText.substr(SA[start - 1], k)
299                          << " = T[SA[" << start << "]:" << k << "]\n";
300                std::cerr << "start = " << start << ", stop = " << stop << "\n";
301                std::cerr << "[fatal (1)] THIS SHOULD NOT HAPPEN\n";
302                std::exit(1);
303              }
304            }
305            if (start == stop) {
306              std::cerr << "[fatal (2)] Interval is empty! (start = " << start
307                        << ") = (stop =  " << stop << ")\n";
308            }
309            if (start == stop) {
310              std::cerr << "[fatal (3)] Interval is empty! (start = " << start
311                        << ") = (stop =  " << stop << ")\n";
312            }
313
314            khash[bits] = {start, stop};
315            /*
316            IndexT len = stop - start;
317            bool overflow = (len >= std::numeric_limits<uint8_t>::max());
318            uint8_t blen = overflow ? std::numeric_limits<uint8_t>::max() :
319                static_cast<uint8_t>(len);
320            khash[bits] = {start, blen};
321            if (overflow) {
322                overflowhash[bits] = len;
323            }
324            */
325          } else {
326            std::cerr << "\nERROR (1): trying to add same suffix "
327                      << currentKmer << " (len = " << currentKmer.length()
328                      << ") multiple times!\n";
329            auto prevInt = hashIt->second;
330            std::cerr << "existing interval is [" << prevInt.begin() << ", "
331                      << prevInt.end() << ")\n";
332            for (auto x = prevInt.begin(); x < prevInt.end(); ++x) {
333              auto suff = concatText.substr(SA[x], k);
334              for (auto c : suff) {
335                std::cerr << "*" << c << "*";
336              }
337              std::cerr << " (len = " << suff.length() << ")\n";
338            }
339            std::cerr << "new interval is [" << start << ", " << stop << ")\n";
340            for (auto x = start; x < stop; ++x) {
341              auto suff = concatText.substr(SA[x], k);
342              for (auto c : suff) {
343                std::cerr << "*" << c << "*";
344              }
345              std::cerr << "\n";
346            }
347          }
348        }
349        currentKmer = nextKmer;
350        start = stop;
351      }
352    } else {
353      // If this isn't a valid suffix (contains a $)
354
355      // If the previous interval was valid, put it
356      // in the hash.
357      if (currentKmer.length() == k and
358          currentKmer.find_first_of('$') == std::string::npos) {
359        mer = currentKmer.c_str();
360        auto bits = mer.word(0);
361        auto hashIt = khash.find(bits);
362        if (hashIt == khash.end()) {
363          if (start > 2) {
364            if (concatText.substr(SA[start - 1], k) ==
365                concatText.substr(SA[start], k)) {
366              std::cerr << "T[SA[" << start - 1 << "]:" << k
367                        << "] = " << concatText.substr(SA[start - 1], k)
368                        << " = T[SA[" << start << "]:" << k << "]\n";
369              std::cerr << "start = " << start << ", stop = " << stop << "\n";
370              std::cerr << "[fatal (4)] THIS SHOULD NOT HAPPEN\n";
371              std::exit(1);
372            }
373          }
374          khash[bits] = {start, stop};
375          /*
376          IndexT len = stop - start;
377          bool overflow = (len >= std::numeric_limits<uint8_t>::max());
378          uint8_t blen = overflow ? std::numeric_limits<uint8_t>::max() :
379              static_cast<uint8_t>(len);
380          khash[bits] = {start, blen};
381          if (overflow) {
382              overflowhash[bits] = len;
383          }
384          */
385        } else {
386          std::cerr << "\nERROR (2): trying to add same suffix " << currentKmer
387                    << "multiple times!\n";
388          auto prevInt = hashIt->second;
389          std::cerr << "existing interval is [" << prevInt.begin() << ", "
390                    << prevInt.end() << ")\n";
391          for (auto x = prevInt.begin(); x < prevInt.end(); ++x) {
392            std::cerr << concatText.substr(SA[x], k) << "\n";
393          }
394          std::cerr << "new interval is [" << start << ", " << stop << ")\n";
395          for (auto x = start; x < stop; ++x) {
396            std::cerr << concatText.substr(SA[x], k) << "\n";
397          }
398        }
399      }
400      // The current interval is invalid and empty
401      currentKmer = nextKmer;
402      start = stop;
403    }
404    if (stop % 1000000 == 0) {
405      std::cerr << "\r\rprocessed " << stop << " positions";
406    }
407    // We always update the end position
408    ++stop;
409  }
410  if (start < tlen) {
411    if (currentKmer.length() == k and
412        currentKmer.find_first_of('$') != std::string::npos) {
413      mer = currentKmer.c_str();
414      khash[mer.word(0)] = {start, stop};
415      /*
416      IndexT len = stop - start;
417      bool overflow = (len >= std::numeric_limits<uint8_t>::max());
418      uint8_t blen = overflow ? std::numeric_limits<uint8_t>::max() :
419          static_cast<uint8_t>(len);
420      khash[mer.get_bits(0, 2 * k)] = {start, blen};
421      if (overflow) {
422          overflowhash[mer.get_bits(0, 2 * k)] = len;
423      }
424      */
425    }
426  }
427  std::cerr << "\nkhash had " << khash.size() << " keys\n";
428  std::ofstream hashStream(outputDir + "hash.bin", std::ios::binary);
429  {
430    ScopedTimer timer;
431    std::cerr << "saving hash to disk . . . ";
432    khash.serialize(typename spp_utils::pod_hash_serializer<WordT, rapmap::utils::SAInterval<IndexT>>(),
433                    &hashStream);
434    std::cerr << "done\n";
435  }
436  hashStream.close();
437  return true;
438}
439
440// To use the parser in the following, we get "jobs" until none is
441// available. A job behaves like a pointer to the type
442// jellyfish::sequence_list (see whole_sequence_parser.hpp).
443template <typename ParserT> //, typename CoverageCalculator>
444void indexTranscriptsSA(ParserT* parser,
445                        std::string& outputDir,
446                        bool noClipPolyA, bool usePerfectHash,
447                        uint32_t numHashThreads,
448                        std::string& sepStr,
449                        std::mutex& iomutex,
450                        std::shared_ptr<spdlog::logger> log) {
451  // Create a random uniform distribution
452  std::default_random_engine eng(271828);
453
454  std::uniform_int_distribution<> dis(0, 3);
455
456  // Hashers for getting txome signature
457  picosha2::hash256_one_by_one seqHasher; seqHasher.init();
458  picosha2::hash256_one_by_one nameHasher; nameHasher.init();
459
460  uint32_t n{0};
461  uint32_t k = rapmap::utils::my_mer::k();
462  std::vector<std::string> transcriptNames;
463  std::vector<int64_t> transcriptStarts;
464  // std::vector<uint32_t> positionIDs;
465  constexpr char bases[] = {'A', 'C', 'G', 'T'};
466  uint32_t polyAClipLength{10};
467  uint32_t numPolyAsClipped{0};
468  uint32_t numNucleotidesReplaced{0};
469  std::string polyA(polyAClipLength, 'A');
470
471  using TranscriptList = std::vector<uint32_t>;
472  using eager_iterator = MerMapT::array::eager_iterator;
473  using KmerBinT = uint64_t;
474
475  bool clipPolyA = !noClipPolyA;
476
477  // http://biology.stackexchange.com/questions/21329/whats-the-longest-transcript-known
478  // longest human transcript is Titin (108861), so this gives us a *lot* of
479  // leeway before
480  // we issue any warning.
481  size_t tooLong = 200000;
482  size_t numDistinctKmers{0};
483  size_t numKmers{0};
484  size_t currIndex{0};
485  std::cerr << "\n[Step 1 of 4] : counting k-mers\n";
486
487  // rsdic::RSDicBuilder rsdb;
488  std::vector<uint64_t>
489      onePos; // Positions in the bit array where we should write a '1'
490  // remember the initial lengths (e.g., before clipping etc., of all transcripts)
491  std::vector<uint32_t> completeLengths;
492  // the stream of transcript sequence
493  fmt::MemoryWriter txpSeqStream;
494  {
495    ScopedTimer timer;
496    // Get the read group by which this thread will
497    // communicate with the parser (*once per-thread*)
498    auto rg = parser->getReadGroup();
499
500    while (parser->refill(rg)) {
501      for (auto& read : rg) { // for each sequence
502        std::string& readStr = read.seq;
503        readStr.erase(
504            std::remove_if(readStr.begin(), readStr.end(),
505                           [](const char a) -> bool { return !(isprint(a)); }),
506            readStr.end());
507
508        seqHasher.process(readStr.begin(), readStr.end());
509
510        uint32_t readLen = readStr.size();
511        uint32_t completeLen = readLen;
512        // First, replace non ATCG nucleotides
513        for (size_t b = 0; b < readLen; ++b) {
514          readStr[b] = ::toupper(readStr[b]);
515          int c = jellyfish::mer_dna::code(readStr[b]);
516          // Replace non-ACGT bases with pseudo-random bases
517          if (jellyfish::mer_dna::not_dna(c)) {
518            char rbase = bases[dis(eng)];
519            c = jellyfish::mer_dna::code(rbase);
520            readStr[b] = rbase;
521            ++numNucleotidesReplaced;
522          }
523        }
524
525        // Now, do Kallisto-esque clipping of polyA tails
526        if (clipPolyA) {
527          if (readStr.size() > polyAClipLength and
528              readStr.substr(readStr.length() - polyAClipLength) == polyA) {
529
530            auto newEndPos = readStr.find_last_not_of("Aa");
531            // If it was all As
532            if (newEndPos == std::string::npos) {
533              log->warn("Entry with header [{}] appeared to be all A's; it "
534                        "will be removed from the index!",
535                        read.name);
536              readStr.resize(0);
537            } else {
538              readStr.resize(newEndPos + 1);
539            }
540            ++numPolyAsClipped;
541          }
542        }
543
544        readLen = readStr.size();
545        // If the transcript was completely removed during clipping, don't
546        // include it in the index.
547        if (readStr.size() > 0) {
548          // If we're suspicious the user has fed in a *genome* rather
549          // than a transcriptome, say so here.
550          if (readStr.size() >= tooLong) {
551            log->warn("Entry with header [{}] was longer than {} nucleotides.  "
552                      "Are you certain that "
553                      "we are indexing a transcriptome and not a genome?",
554                      read.name, tooLong);
555          } else if (readStr.size() < k) {
556            log->warn("Entry with header [{}], had length less than "
557                      "the k-mer length of {} (perhaps after poly-A clipping)",
558                      read.name, k);
559          }
560
561          uint32_t txpIndex = n++;
562
563          // The name of the current transcript
564          auto& recHeader = read.name;
565          auto processedName = recHeader.substr(0, recHeader.find_first_of(sepStr));
566          transcriptNames.emplace_back(processedName);
567          nameHasher.process(processedName.begin(), processedName.end());
568
569          // The position at which this transcript starts
570          transcriptStarts.push_back(currIndex);
571          // The un-molested length of this transcript
572          completeLengths.push_back(completeLen);
573
574          txpSeqStream << readStr;
575          txpSeqStream << '$';
576          currIndex += readLen + 1;
577          onePos.push_back(currIndex - 1);
578        } else {
579            log->warn("Discarding entry with header [{}], since it had length 0 "
580                      "(perhaps after poly-A clipping)",
581                      read.name);
582        }
583      }
584      if (n % 10000 == 0) {
585        std::cerr << "\r\rcounted k-mers for " << n << " transcripts";
586      }
587    }
588  }
589  std::cerr << "\n";
590
591  std::cerr << "Replaced " << numNucleotidesReplaced
592            << " non-ATCG nucleotides\n";
593  std::cerr << "Clipped poly-A tails from " << numPolyAsClipped
594            << " transcripts\n";
595
596  // Put the concatenated text in a string
597  std::string concatText = txpSeqStream.str();
598  // And clear the stream
599  txpSeqStream.clear();
600
601  // Build the suffix array
602  size_t tlen = concatText.length();
603  size_t maxInt = std::numeric_limits<int32_t>::max();
604  bool largeIndex = (tlen + 1 > maxInt);
605
606  // Make our dense bit arrray
607  BIT_ARRAY* bitArray = bit_array_create(concatText.length());
608  for (auto p : onePos) {
609    bit_array_set_bit(bitArray, p);
610  }
611
612  onePos.clear();
613  onePos.shrink_to_fit();
614
615  std::string rsFileName = outputDir + "rsd.bin";
616  FILE* rsFile = fopen(rsFileName.c_str(), "w");
617  {
618    ScopedTimer timer;
619    std::cerr << "Building rank-select dictionary and saving to disk ";
620    bit_array_save(bitArray, rsFile);
621    std::cerr << "done\n";
622  }
623  fclose(rsFile);
624  bit_array_free(bitArray);
625
626  std::ofstream seqStream(outputDir + "txpInfo.bin", std::ios::binary);
627  {
628    ScopedTimer timer;
629    std::cerr << "Writing sequence data to file . . . ";
630    cereal::BinaryOutputArchive seqArchive(seqStream);
631    seqArchive(transcriptNames);
632    if (largeIndex) {
633      seqArchive(transcriptStarts);
634    } else {
635      std::vector<int32_t> txpStarts(transcriptStarts.size(), 0);
636      size_t numTranscriptStarts = transcriptStarts.size();
637      for (size_t i = 0; i < numTranscriptStarts; ++i) {
638        txpStarts[i] = static_cast<int32_t>(transcriptStarts[i]);
639      }
640      transcriptStarts.clear();
641      transcriptStarts.shrink_to_fit();
642      { seqArchive(txpStarts); }
643    }
644    // seqArchive(positionIDs);
645    seqArchive(concatText);
646    seqArchive(completeLengths);
647    std::cerr << "done\n";
648  }
649  seqStream.close();
650
651  // clear stuff we no longer need
652  // positionIDs.clear();
653  // positionIDs.shrink_to_fit();
654  transcriptStarts.clear();
655  transcriptStarts.shrink_to_fit();
656  transcriptNames.clear();
657  transcriptNames.shrink_to_fit();
658  // done clearing
659
660  if (largeIndex) {
661    largeIndex = true;
662    std::cerr << "[info] Building 64-bit suffix array "
663                 "(length of generalized text is "
664              << tlen << " )\n";
665    using IndexT = int64_t;
666    std::vector<IndexT> SA;
667    bool success = buildSA(outputDir, concatText, tlen, SA);
668    if (!success) {
669      std::cerr << "[fatal] Could not build the suffix array!\n";
670      std::exit(1);
671    }
672
673    if (usePerfectHash) {
674      success = buildPerfectHash<IndexT>(outputDir, concatText, tlen, k, SA,
675                                         numHashThreads);
676    } else {
677      success = buildHash<IndexT>(outputDir, concatText, tlen, k, SA);
678    }
679    if (!success) {
680      std::cerr << "[fatal] Could not build the suffix interval hash!\n";
681      std::exit(1);
682    }
683  } else {
684    std::cerr << "[info] Building 32-bit suffix array "
685                 "(length of generalized text is "
686              << tlen << ")\n";
687    using IndexT = int32_t;
688    std::vector<IndexT> SA;
689
690    bool success = buildSA(outputDir, concatText, tlen, SA);
691    if (!success) {
692      std::cerr << "[fatal] Could not build the suffix array!\n";
693      std::exit(1);
694    }
695
696    if (usePerfectHash) {
697      success = buildPerfectHash<IndexT>(outputDir, concatText, tlen, k, SA,
698                                         numHashThreads);
699    } else {
700      success = buildHash<IndexT>(outputDir, concatText, tlen, k, SA);
701    }
702    if (!success) {
703      std::cerr << "[fatal] Could not build the suffix interval hash!\n";
704      std::exit(1);
705    }
706  }
707
708  seqHasher.finish();
709  nameHasher.finish();
710
711  std::string indexVersion = "q5";
712  IndexHeader header(IndexType::QUASI,
713                     indexVersion,
714                     true, k, largeIndex,
715                     usePerfectHash);
716  // Set the hash info
717  std::string seqHash;
718  std::string nameHash;
719  picosha2::get_hash_hex_string(seqHasher, seqHash);
720  picosha2::get_hash_hex_string(nameHasher, nameHash);
721  header.setSeqHash(seqHash);
722  header.setNameHash(nameHash);
723
724  // Finally (since everything presumably succeeded) write the header
725  std::ofstream headerStream(outputDir + "header.json");
726  {
727    cereal::JSONOutputArchive archive(headerStream);
728    archive(header);
729  }
730  headerStream.close();
731}
732
733std::string readData(std::istream& in)
734	{
735          std::string inputStr;
736	  std::string line;
737	  
738	  while (getline(in, line)) {
739    		if (line == "^D")
740        		break;
741    		inputStr +=line + "\n";
742	  }
743	  return inputStr;
744	}
745
746int rapMapSAIndex(int argc, char* argv[]) {
747  std::cerr << "RapMap Indexer\n";
748
749  
750
751
752
753
754
755
756  TCLAP::CmdLine cmd("RapMap Indexer");
757  TCLAP::ValueArg<std::string> transcripts("t", "transcripts",
758                                           "The transcript file to be indexed",
759                                           false, readData(std::cin), "string");
760  TCLAP::ValueArg<std::string> index(
761      "i", "index", "The location where the index should be written", true, "",
762      "path");
763  TCLAP::ValueArg<uint32_t> kval("k", "klen", "The length of k-mer to index",
764                                 false, 31, "positive integer less than 32");
765
766  TCLAP::ValueArg<std::string> customSeps("s", "headerSep", "Instead of a space or tab, break the header at the first "
767                                          "occurrence of this string, and name the transcript as the token before "
768                                          "the first separator", false, " \t", "string");
769  TCLAP::SwitchArg noClip(
770      "n", "noClip",
771      "Don't clip poly-A tails from the ends of target sequences", false);
772  TCLAP::SwitchArg perfectHash(
773      "p", "perfectHash", "Use a perfect hash instead of sparse hash --- "
774                          "somewhat slows construction, but uses less memory",
775      false);
776  /*
777  TCLAP::SwitchArg perfectHash(
778      "f", "frugalPerfectHash", "Use a frugal variant of the perfect hash --- "
779                          "this will considerably slow construction, and somewhat slow lookup, but "
780                          "hash construction and the subsequent mapping will require the least memory."
781      false);
782  */
783  TCLAP::ValueArg<uint32_t> numHashThreads(
784      "x", "numThreads",
785      "Use this many threads to build the perfect hash function", false, 4,
786      "positive integer <= # cores");
787  cmd.add(transcripts);
788  cmd.add(index);
789  cmd.add(kval);
790  cmd.add(noClip);
791  cmd.add(perfectHash);
792  cmd.add(customSeps);
793  cmd.add(numHashThreads);
794  cmd.parse(argc, argv);
795 
796  // stupid parsing for now
797  std::string transcriptFile(transcripts.getValue());
798  std::vector<std::string> transcriptFiles({transcriptFile});
799  //std::vector<std::string> transcriptFiles(transcriptFile);
800  
801
802  std::string sepStr = customSeps.getValue();
803
804  uint32_t k = kval.getValue();
805  if (k % 2 == 0) {
806    std::cerr << "Error: k must be an odd value, you chose " << k << '\n';
807    std::exit(1);
808  } else if (k > 31) {
809    std::cerr << "Error: k must not be larger than 31, you chose " << k << '\n';
810    std::exit(1);
811  }
812  rapmap::utils::my_mer::k(k);
813
814  std::string indexDir = index.getValue();
815  if (indexDir.back() != '/') {
816    indexDir += '/';
817  }
818  bool dirExists = rapmap::fs::DirExists(indexDir.c_str());
819  bool dirIsFile = rapmap::fs::FileExists(indexDir.c_str());
820  if (dirIsFile) {
821    std::cerr << "The requested index directory already exists as a file.";
822    std::exit(1);
823  }
824  if (!dirExists) {
825    rapmap::fs::MakeDir(indexDir.c_str());
826  }
827
828  std::string logPath = indexDir + "quasi_index.log";
829  auto fileSink = std::make_shared<spdlog::sinks::simple_file_sink_st>(logPath);
830  auto consoleSink = std::make_shared<spdlog::sinks::stderr_sink_st>();
831  auto consoleLog = spdlog::create("stderrLog", {consoleSink});
832  auto fileLog = spdlog::create("fileLog", {fileSink});
833  auto jointLog = spdlog::create("jointLog", {fileSink, consoleSink});
834
835  size_t numThreads{1};
836
837  std::unique_ptr<single_parser> transcriptParserPtr{nullptr};
838
839  size_t numProd = 1;
840  transcriptParserPtr.reset(
841			    new single_parser(transcriptFiles, numThreads, numProd));
842  transcriptParserPtr->start();
843  bool noClipPolyA = noClip.getValue();
844  bool usePerfectHash = perfectHash.getValue();
845  uint32_t numPerfectHashThreads = numHashThreads.getValue();
846  std::mutex iomutex;
847  indexTranscriptsSA(transcriptParserPtr.get(), indexDir, noClipPolyA,
848                     usePerfectHash, numPerfectHashThreads, sepStr, iomutex, jointLog);
849
850  // Output info about the reference
851  std::ofstream refInfoStream(indexDir + "refInfo.json");
852  {
853    cereal::JSONOutputArchive archive(refInfoStream);
854    archive(cereal::make_nvp("ReferenceFiles", transcriptFiles));
855  }
856  refInfoStream.close();
857
858  return 0;
859}