· 7 years ago · Nov 24, 2018, 02:28 PM
1//
2// RapMap - Rapid and accurate mapping of short reads to transcriptomes using
3// quasi-mapping.
4// Copyright (C) 2015, 2016, 2017 Rob Patro, Avi Srivastava, Hirak Sarkar
5//
6// This file is part of RapMap.
7//
8// RapMap is free software: you can redistribute it and/or modify
9// it under the terms of the GNU General Public License as published by
10// the Free Software Foundation, either version 3 of the License, or
11// (at your option) any later version.
12//
13// RapMap is distributed in the hope that it will be useful,
14// but WITHOUT ANY WARRANTY; without even the implied warranty of
15// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16// GNU General Public License for more details.
17//
18// You should have received a copy of the GNU General Public License
19// along with RapMap. If not, see <http://www.gnu.org/licenses/>.
20//
21
22#include <algorithm>
23#include <cctype>
24#include <cstdio>
25#include <fstream>
26#include <iostream>
27#include <iterator>
28#include <memory>
29#include <mutex>
30#include <random>
31#include <type_traits>
32#include <unordered_map>
33#include <vector>
34#include <string.h>
35
36#include "tclap/CmdLine.h"
37
38#include <cereal/archives/binary.hpp>
39#include <cereal/archives/json.hpp>
40#include <cereal/types/string.hpp>
41#include <cereal/types/unordered_map.hpp>
42#include <cereal/types/utility.hpp>
43#include <cereal/types/vector.hpp>
44
45#include "BooMap.hpp"
46#include "FrugalBooMap.hpp"
47#include "xxhash.h"
48
49#include "spdlog/spdlog.h"
50
51#include "FastxParser.hpp"
52// Jellyfish 2 include
53#include "jellyfish/mer_dna.hpp"
54
55#include "divsufsort.h"
56#include "divsufsort64.h"
57
58#include "RapMapFileSystem.hpp"
59#include "RapMapUtils.hpp"
60#include "ScopedTimer.hpp"
61#include "bit_array.h"
62
63#include "JFRaw.hpp"
64#include "jellyfish/binary_dumper.hpp"
65#include "jellyfish/file_header.hpp"
66#include "jellyfish/hash_counter.hpp"
67#include "jellyfish/mer_iterator.hpp"
68#include "jellyfish/mer_overlap_sequence_parser.hpp"
69#include "jellyfish/thread_exec.hpp"
70#include "rank9b.h"
71
72#include "IndexHeader.hpp"
73
74// sha functionality
75#include "picosha2.h"
76
77#include <chrono>
78
79using single_parser = fastx_parser::FastxParser<fastx_parser::ReadSeq>;
80using TranscriptID = uint32_t;
81using TranscriptIDVector = std::vector<TranscriptID>;
82using KmerIDMap = std::vector<TranscriptIDVector>;
83using MerMapT = jellyfish::cooperative::hash_counter<rapmap::utils::my_mer>;
84
85bool buildSA(const std::string& outputDir, std::string& concatText, size_t tlen,
86 std::vector<int64_t>& SA) {
87 // IndexT is the signed index type
88 // UIndexT is the unsigned index type
89 using IndexT = int64_t;
90 using UIndexT = uint64_t;
91 bool success{false};
92
93 std::ofstream saStream(outputDir + "sa.bin", std::ios::binary);
94 {
95 ScopedTimer timer;
96 SA.resize(tlen, 0);
97 IndexT textLen = static_cast<IndexT>(tlen);
98 std::cerr << "Building suffix array . . . ";
99 auto ret = divsufsort64(
100 reinterpret_cast<unsigned char*>(const_cast<char*>(concatText.data())),
101 SA.data(), tlen);
102
103 success = (ret == 0);
104 if (success) {
105 std::cerr << "success\n";
106 {
107 ScopedTimer timer2;
108 std::cerr << "saving to disk . . . ";
109 cereal::BinaryOutputArchive saArchive(saStream);
110 saArchive(SA);
111 std::cerr << "done\n";
112 }
113 } else {
114 std::cerr << "FAILURE: return code from libdivsufsort64() was " << ret
115 << "\n";
116 saStream.close();
117 std::exit(1);
118 }
119 std::cerr << "done\n";
120 }
121 saStream.close();
122 return success;
123}
124
125// IndexT is the index type.
126// int32_t for "small" suffix arrays
127// int64_t for "large" ones
128template <typename IndexT>
129bool buildPerfectHash(const std::string& outputDir, std::string& concatText,
130 size_t tlen, uint32_t k, std::vector<IndexT>& SA,
131 uint32_t numHashThreads) {
132 //BooMap<uint64_t, rapmap::utils::SAInterval<IndexT>> intervals;
133 PerfectHashT<uint64_t, rapmap::utils::SAInterval<IndexT>> intervals;
134 intervals.setSAPtr(&SA);
135 intervals.setTextPtr(concatText.data(), concatText.length());
136
137 // The start and stop of the current interval
138 IndexT start = 0, stop = 0;
139 // An iterator to the beginning of the text
140 auto textB = concatText.begin();
141 auto textE = concatText.end();
142 // The current k-mer as a string
143 rapmap::utils::my_mer mer;
144 bool currentValid{false};
145 std::string currentKmer;
146 std::string nextKmer;
147 while (stop < tlen) {
148 // Check if the string starting at the
149 // current position is valid (i.e. doesn't contain $)
150 // and is <= k bases from the end of the string
151 nextKmer = concatText.substr(SA[stop], k);
152 if (nextKmer.length() == k and
153 nextKmer.find_first_of('$') == std::string::npos) {
154 // If this is a new k-mer, then hash the current k-mer
155 if (nextKmer != currentKmer) {
156 if (currentKmer.length() == k and
157 currentKmer.find_first_of('$') == std::string::npos) {
158 mer = rapmap::utils::my_mer(currentKmer);
159 auto bits = mer.get_bits(0, 2 * k);
160 intervals.add(std::move(bits), {start, stop});
161 // push_back(std::make_pair<uint64_t,
162 // rapmap::utils::SAInterval<IndexT>>(std::move(bits), {start,
163 // stop}));
164 }
165 currentKmer = nextKmer;
166 start = stop;
167 }
168 } else {
169 // If this isn't a valid suffix (contains a $)
170 // If the previous interval was valid, put it
171 // in the hash.
172 if (currentKmer.length() == k and
173 currentKmer.find_first_of('$') == std::string::npos) {
174 mer = rapmap::utils::my_mer(currentKmer);
175 auto bits = mer.get_bits(0, 2 * k);
176 // intervals.push_back(std::make_pair<uint64_t,
177 // rapmap::utils::SAInterval<IndexT>>(std::move(bits), {start, stop}));
178 intervals.add(std::move(bits), {start, stop});
179 }
180 // The current interval is invalid and empty
181 currentKmer = nextKmer;
182 start = stop;
183 }
184 if (stop % 1000000 == 0) {
185 std::cerr << "\r\rprocessed " << stop << " positions";
186 }
187 // We always update the end position
188 ++stop;
189 }
190 if (start < tlen) {
191 if (currentKmer.length() == k and
192 currentKmer.find_first_of('$') != std::string::npos) {
193 mer = rapmap::utils::my_mer(currentKmer);
194 auto bits = mer.get_bits(0, 2 * k);
195 // intervals.push_back(std::make_pair<uint64_t,
196 // rapmap::utils::SAInterval<IndexT>>(std::move(bits), {start, stop}));
197 intervals.add(std::move(bits), {start, stop});
198 }
199 }
200
201 // std::cerr << "\nthere are " << intervals.size() << " intervals of the
202 // selected depth\n";
203
204 std::cout << "building perfect hash function\n";
205 intervals.build(numHashThreads);
206 std::cout << "\ndone.\n";
207 std::string outputPrefix = outputDir + "hash_info";
208 std::cout << "saving the perfect hash and SA intervals to disk ... ";
209 intervals.save(outputPrefix);
210 std::cout << "done.\n";
211
212 return true;
213}
214
215bool buildSA(const std::string& outputDir, std::string& concatText, size_t tlen,
216 std::vector<int32_t>& SA) {
217 // IndexT is the signed index type
218 // UIndexT is the unsigned index type
219 using IndexT = int32_t;
220 using UIndexT = uint32_t;
221 bool success{false};
222
223 std::ofstream saStream(outputDir + "sa.bin", std::ios::binary);
224 {
225 ScopedTimer timer;
226 SA.resize(tlen, 0);
227 IndexT textLen = static_cast<IndexT>(tlen);
228 std::cerr << "Building suffix array . . . ";
229 auto ret = divsufsort(
230 reinterpret_cast<unsigned char*>(const_cast<char*>(concatText.data())),
231 SA.data(), tlen);
232
233 success = (ret == 0);
234 if (success) {
235 std::cerr << "success\n";
236 {
237 ScopedTimer timer2;
238 std::cerr << "saving to disk . . . ";
239 cereal::BinaryOutputArchive saArchive(saStream);
240 saArchive(SA);
241 std::cerr << "done\n";
242 }
243 } else {
244 std::cerr << "FAILURE: return code from libdivsufsort() was " << ret
245 << "\n";
246 saStream.close();
247 std::exit(1);
248 }
249 std::cerr << "done\n";
250 }
251 saStream.close();
252 return success;
253}
254
255// IndexT is the index type.
256// int32_t for "small" suffix arrays
257// int64_t for "large" ones
258template <typename IndexT>
259bool buildHash(const std::string& outputDir, std::string& concatText,
260 size_t tlen, uint32_t k, std::vector<IndexT>& SA) {
261 // Now, build the k-mer lookup table
262 // The base type should always be uint64_t
263 using WordT = rapmap::utils::my_mer::base_type;
264 RegHashT<WordT, rapmap::utils::SAInterval<IndexT>,
265 rapmap::utils::KmerKeyHasher> khash;
266 //RegHashT<uint64_t, IndexT, rapmap::utils::KmerKeyHasher> overflowhash;
267 //std::cerr << "sizeof(SAInterval<IndexT>) = " << sizeof(rapmap::utils::SAInterval<IndexT>) << '\n';
268 //khash.set_empty_key(std::numeric_limits<uint64_t>::max());
269 // The start and stop of the current interval
270 IndexT start = 0, stop = 0;
271 // An iterator to the beginning of the text
272 auto textB = concatText.begin();
273 auto textE = concatText.end();
274 // The current k-mer as a string
275 rapmap::utils::my_mer mer;
276 bool currentValid{false};
277 std::string currentKmer;
278 std::string nextKmer;
279 while (stop < tlen) {
280 // Check if the string starting at the
281 // current position is valid (i.e. doesn't contain $)
282 // and is <= k bases from the end of the string
283 nextKmer = concatText.substr(SA[stop], k);
284 if (nextKmer.length() == k and
285 nextKmer.find_first_of('$') == std::string::npos) {
286 // If this is a new k-mer, then hash the current k-mer
287 if (nextKmer != currentKmer) {
288 if (currentKmer.length() == k and
289 currentKmer.find_first_of('$') == std::string::npos) {
290 mer = currentKmer;
291 auto bits = mer.word(0);
292 auto hashIt = khash.find(bits);
293 if (hashIt == khash.end()) {
294 if (start > 1) {
295 if (concatText.substr(SA[start - 1], k) ==
296 concatText.substr(SA[start], k)) {
297 std::cerr << "T[SA[" << start - 1 << "]:" << k
298 << "] = " << concatText.substr(SA[start - 1], k)
299 << " = T[SA[" << start << "]:" << k << "]\n";
300 std::cerr << "start = " << start << ", stop = " << stop << "\n";
301 std::cerr << "[fatal (1)] THIS SHOULD NOT HAPPEN\n";
302 std::exit(1);
303 }
304 }
305 if (start == stop) {
306 std::cerr << "[fatal (2)] Interval is empty! (start = " << start
307 << ") = (stop = " << stop << ")\n";
308 }
309 if (start == stop) {
310 std::cerr << "[fatal (3)] Interval is empty! (start = " << start
311 << ") = (stop = " << stop << ")\n";
312 }
313
314 khash[bits] = {start, stop};
315 /*
316 IndexT len = stop - start;
317 bool overflow = (len >= std::numeric_limits<uint8_t>::max());
318 uint8_t blen = overflow ? std::numeric_limits<uint8_t>::max() :
319 static_cast<uint8_t>(len);
320 khash[bits] = {start, blen};
321 if (overflow) {
322 overflowhash[bits] = len;
323 }
324 */
325 } else {
326 std::cerr << "\nERROR (1): trying to add same suffix "
327 << currentKmer << " (len = " << currentKmer.length()
328 << ") multiple times!\n";
329 auto prevInt = hashIt->second;
330 std::cerr << "existing interval is [" << prevInt.begin() << ", "
331 << prevInt.end() << ")\n";
332 for (auto x = prevInt.begin(); x < prevInt.end(); ++x) {
333 auto suff = concatText.substr(SA[x], k);
334 for (auto c : suff) {
335 std::cerr << "*" << c << "*";
336 }
337 std::cerr << " (len = " << suff.length() << ")\n";
338 }
339 std::cerr << "new interval is [" << start << ", " << stop << ")\n";
340 for (auto x = start; x < stop; ++x) {
341 auto suff = concatText.substr(SA[x], k);
342 for (auto c : suff) {
343 std::cerr << "*" << c << "*";
344 }
345 std::cerr << "\n";
346 }
347 }
348 }
349 currentKmer = nextKmer;
350 start = stop;
351 }
352 } else {
353 // If this isn't a valid suffix (contains a $)
354
355 // If the previous interval was valid, put it
356 // in the hash.
357 if (currentKmer.length() == k and
358 currentKmer.find_first_of('$') == std::string::npos) {
359 mer = currentKmer.c_str();
360 auto bits = mer.word(0);
361 auto hashIt = khash.find(bits);
362 if (hashIt == khash.end()) {
363 if (start > 2) {
364 if (concatText.substr(SA[start - 1], k) ==
365 concatText.substr(SA[start], k)) {
366 std::cerr << "T[SA[" << start - 1 << "]:" << k
367 << "] = " << concatText.substr(SA[start - 1], k)
368 << " = T[SA[" << start << "]:" << k << "]\n";
369 std::cerr << "start = " << start << ", stop = " << stop << "\n";
370 std::cerr << "[fatal (4)] THIS SHOULD NOT HAPPEN\n";
371 std::exit(1);
372 }
373 }
374 khash[bits] = {start, stop};
375 /*
376 IndexT len = stop - start;
377 bool overflow = (len >= std::numeric_limits<uint8_t>::max());
378 uint8_t blen = overflow ? std::numeric_limits<uint8_t>::max() :
379 static_cast<uint8_t>(len);
380 khash[bits] = {start, blen};
381 if (overflow) {
382 overflowhash[bits] = len;
383 }
384 */
385 } else {
386 std::cerr << "\nERROR (2): trying to add same suffix " << currentKmer
387 << "multiple times!\n";
388 auto prevInt = hashIt->second;
389 std::cerr << "existing interval is [" << prevInt.begin() << ", "
390 << prevInt.end() << ")\n";
391 for (auto x = prevInt.begin(); x < prevInt.end(); ++x) {
392 std::cerr << concatText.substr(SA[x], k) << "\n";
393 }
394 std::cerr << "new interval is [" << start << ", " << stop << ")\n";
395 for (auto x = start; x < stop; ++x) {
396 std::cerr << concatText.substr(SA[x], k) << "\n";
397 }
398 }
399 }
400 // The current interval is invalid and empty
401 currentKmer = nextKmer;
402 start = stop;
403 }
404 if (stop % 1000000 == 0) {
405 std::cerr << "\r\rprocessed " << stop << " positions";
406 }
407 // We always update the end position
408 ++stop;
409 }
410 if (start < tlen) {
411 if (currentKmer.length() == k and
412 currentKmer.find_first_of('$') != std::string::npos) {
413 mer = currentKmer.c_str();
414 khash[mer.word(0)] = {start, stop};
415 /*
416 IndexT len = stop - start;
417 bool overflow = (len >= std::numeric_limits<uint8_t>::max());
418 uint8_t blen = overflow ? std::numeric_limits<uint8_t>::max() :
419 static_cast<uint8_t>(len);
420 khash[mer.get_bits(0, 2 * k)] = {start, blen};
421 if (overflow) {
422 overflowhash[mer.get_bits(0, 2 * k)] = len;
423 }
424 */
425 }
426 }
427 std::cerr << "\nkhash had " << khash.size() << " keys\n";
428 std::ofstream hashStream(outputDir + "hash.bin", std::ios::binary);
429 {
430 ScopedTimer timer;
431 std::cerr << "saving hash to disk . . . ";
432 khash.serialize(typename spp_utils::pod_hash_serializer<WordT, rapmap::utils::SAInterval<IndexT>>(),
433 &hashStream);
434 std::cerr << "done\n";
435 }
436 hashStream.close();
437 return true;
438}
439
440// To use the parser in the following, we get "jobs" until none is
441// available. A job behaves like a pointer to the type
442// jellyfish::sequence_list (see whole_sequence_parser.hpp).
443template <typename ParserT> //, typename CoverageCalculator>
444void indexTranscriptsSA(ParserT* parser,
445 std::string& outputDir,
446 bool noClipPolyA, bool usePerfectHash,
447 uint32_t numHashThreads,
448 std::string& sepStr,
449 std::mutex& iomutex,
450 std::shared_ptr<spdlog::logger> log) {
451 // Create a random uniform distribution
452 std::default_random_engine eng(271828);
453
454 std::uniform_int_distribution<> dis(0, 3);
455
456 // Hashers for getting txome signature
457 picosha2::hash256_one_by_one seqHasher; seqHasher.init();
458 picosha2::hash256_one_by_one nameHasher; nameHasher.init();
459
460 uint32_t n{0};
461 uint32_t k = rapmap::utils::my_mer::k();
462 std::vector<std::string> transcriptNames;
463 std::vector<int64_t> transcriptStarts;
464 // std::vector<uint32_t> positionIDs;
465 constexpr char bases[] = {'A', 'C', 'G', 'T'};
466 uint32_t polyAClipLength{10};
467 uint32_t numPolyAsClipped{0};
468 uint32_t numNucleotidesReplaced{0};
469 std::string polyA(polyAClipLength, 'A');
470
471 using TranscriptList = std::vector<uint32_t>;
472 using eager_iterator = MerMapT::array::eager_iterator;
473 using KmerBinT = uint64_t;
474
475 bool clipPolyA = !noClipPolyA;
476
477 // http://biology.stackexchange.com/questions/21329/whats-the-longest-transcript-known
478 // longest human transcript is Titin (108861), so this gives us a *lot* of
479 // leeway before
480 // we issue any warning.
481 size_t tooLong = 200000;
482 size_t numDistinctKmers{0};
483 size_t numKmers{0};
484 size_t currIndex{0};
485 std::cerr << "\n[Step 1 of 4] : counting k-mers\n";
486
487 // rsdic::RSDicBuilder rsdb;
488 std::vector<uint64_t>
489 onePos; // Positions in the bit array where we should write a '1'
490 // remember the initial lengths (e.g., before clipping etc., of all transcripts)
491 std::vector<uint32_t> completeLengths;
492 // the stream of transcript sequence
493 fmt::MemoryWriter txpSeqStream;
494 {
495 ScopedTimer timer;
496 // Get the read group by which this thread will
497 // communicate with the parser (*once per-thread*)
498 auto rg = parser->getReadGroup();
499
500 while (parser->refill(rg)) {
501 for (auto& read : rg) { // for each sequence
502 std::string& readStr = read.seq;
503 readStr.erase(
504 std::remove_if(readStr.begin(), readStr.end(),
505 [](const char a) -> bool { return !(isprint(a)); }),
506 readStr.end());
507
508 seqHasher.process(readStr.begin(), readStr.end());
509
510 uint32_t readLen = readStr.size();
511 uint32_t completeLen = readLen;
512 // First, replace non ATCG nucleotides
513 for (size_t b = 0; b < readLen; ++b) {
514 readStr[b] = ::toupper(readStr[b]);
515 int c = jellyfish::mer_dna::code(readStr[b]);
516 // Replace non-ACGT bases with pseudo-random bases
517 if (jellyfish::mer_dna::not_dna(c)) {
518 char rbase = bases[dis(eng)];
519 c = jellyfish::mer_dna::code(rbase);
520 readStr[b] = rbase;
521 ++numNucleotidesReplaced;
522 }
523 }
524
525 // Now, do Kallisto-esque clipping of polyA tails
526 if (clipPolyA) {
527 if (readStr.size() > polyAClipLength and
528 readStr.substr(readStr.length() - polyAClipLength) == polyA) {
529
530 auto newEndPos = readStr.find_last_not_of("Aa");
531 // If it was all As
532 if (newEndPos == std::string::npos) {
533 log->warn("Entry with header [{}] appeared to be all A's; it "
534 "will be removed from the index!",
535 read.name);
536 readStr.resize(0);
537 } else {
538 readStr.resize(newEndPos + 1);
539 }
540 ++numPolyAsClipped;
541 }
542 }
543
544 readLen = readStr.size();
545 // If the transcript was completely removed during clipping, don't
546 // include it in the index.
547 if (readStr.size() > 0) {
548 // If we're suspicious the user has fed in a *genome* rather
549 // than a transcriptome, say so here.
550 if (readStr.size() >= tooLong) {
551 log->warn("Entry with header [{}] was longer than {} nucleotides. "
552 "Are you certain that "
553 "we are indexing a transcriptome and not a genome?",
554 read.name, tooLong);
555 } else if (readStr.size() < k) {
556 log->warn("Entry with header [{}], had length less than "
557 "the k-mer length of {} (perhaps after poly-A clipping)",
558 read.name, k);
559 }
560
561 uint32_t txpIndex = n++;
562
563 // The name of the current transcript
564 auto& recHeader = read.name;
565 auto processedName = recHeader.substr(0, recHeader.find_first_of(sepStr));
566 transcriptNames.emplace_back(processedName);
567 nameHasher.process(processedName.begin(), processedName.end());
568
569 // The position at which this transcript starts
570 transcriptStarts.push_back(currIndex);
571 // The un-molested length of this transcript
572 completeLengths.push_back(completeLen);
573
574 txpSeqStream << readStr;
575 txpSeqStream << '$';
576 currIndex += readLen + 1;
577 onePos.push_back(currIndex - 1);
578 } else {
579 log->warn("Discarding entry with header [{}], since it had length 0 "
580 "(perhaps after poly-A clipping)",
581 read.name);
582 }
583 }
584 if (n % 10000 == 0) {
585 std::cerr << "\r\rcounted k-mers for " << n << " transcripts";
586 }
587 }
588 }
589 std::cerr << "\n";
590
591 std::cerr << "Replaced " << numNucleotidesReplaced
592 << " non-ATCG nucleotides\n";
593 std::cerr << "Clipped poly-A tails from " << numPolyAsClipped
594 << " transcripts\n";
595
596 // Put the concatenated text in a string
597 std::string concatText = txpSeqStream.str();
598 // And clear the stream
599 txpSeqStream.clear();
600
601 // Build the suffix array
602 size_t tlen = concatText.length();
603 size_t maxInt = std::numeric_limits<int32_t>::max();
604 bool largeIndex = (tlen + 1 > maxInt);
605
606 // Make our dense bit arrray
607 BIT_ARRAY* bitArray = bit_array_create(concatText.length());
608 for (auto p : onePos) {
609 bit_array_set_bit(bitArray, p);
610 }
611
612 onePos.clear();
613 onePos.shrink_to_fit();
614
615 std::string rsFileName = outputDir + "rsd.bin";
616 FILE* rsFile = fopen(rsFileName.c_str(), "w");
617 {
618 ScopedTimer timer;
619 std::cerr << "Building rank-select dictionary and saving to disk ";
620 bit_array_save(bitArray, rsFile);
621 std::cerr << "done\n";
622 }
623 fclose(rsFile);
624 bit_array_free(bitArray);
625
626 std::ofstream seqStream(outputDir + "txpInfo.bin", std::ios::binary);
627 {
628 ScopedTimer timer;
629 std::cerr << "Writing sequence data to file . . . ";
630 cereal::BinaryOutputArchive seqArchive(seqStream);
631 seqArchive(transcriptNames);
632 if (largeIndex) {
633 seqArchive(transcriptStarts);
634 } else {
635 std::vector<int32_t> txpStarts(transcriptStarts.size(), 0);
636 size_t numTranscriptStarts = transcriptStarts.size();
637 for (size_t i = 0; i < numTranscriptStarts; ++i) {
638 txpStarts[i] = static_cast<int32_t>(transcriptStarts[i]);
639 }
640 transcriptStarts.clear();
641 transcriptStarts.shrink_to_fit();
642 { seqArchive(txpStarts); }
643 }
644 // seqArchive(positionIDs);
645 seqArchive(concatText);
646 seqArchive(completeLengths);
647 std::cerr << "done\n";
648 }
649 seqStream.close();
650
651 // clear stuff we no longer need
652 // positionIDs.clear();
653 // positionIDs.shrink_to_fit();
654 transcriptStarts.clear();
655 transcriptStarts.shrink_to_fit();
656 transcriptNames.clear();
657 transcriptNames.shrink_to_fit();
658 // done clearing
659
660 if (largeIndex) {
661 largeIndex = true;
662 std::cerr << "[info] Building 64-bit suffix array "
663 "(length of generalized text is "
664 << tlen << " )\n";
665 using IndexT = int64_t;
666 std::vector<IndexT> SA;
667 bool success = buildSA(outputDir, concatText, tlen, SA);
668 if (!success) {
669 std::cerr << "[fatal] Could not build the suffix array!\n";
670 std::exit(1);
671 }
672
673 if (usePerfectHash) {
674 success = buildPerfectHash<IndexT>(outputDir, concatText, tlen, k, SA,
675 numHashThreads);
676 } else {
677 success = buildHash<IndexT>(outputDir, concatText, tlen, k, SA);
678 }
679 if (!success) {
680 std::cerr << "[fatal] Could not build the suffix interval hash!\n";
681 std::exit(1);
682 }
683 } else {
684 std::cerr << "[info] Building 32-bit suffix array "
685 "(length of generalized text is "
686 << tlen << ")\n";
687 using IndexT = int32_t;
688 std::vector<IndexT> SA;
689
690 bool success = buildSA(outputDir, concatText, tlen, SA);
691 if (!success) {
692 std::cerr << "[fatal] Could not build the suffix array!\n";
693 std::exit(1);
694 }
695
696 if (usePerfectHash) {
697 success = buildPerfectHash<IndexT>(outputDir, concatText, tlen, k, SA,
698 numHashThreads);
699 } else {
700 success = buildHash<IndexT>(outputDir, concatText, tlen, k, SA);
701 }
702 if (!success) {
703 std::cerr << "[fatal] Could not build the suffix interval hash!\n";
704 std::exit(1);
705 }
706 }
707
708 seqHasher.finish();
709 nameHasher.finish();
710
711 std::string indexVersion = "q5";
712 IndexHeader header(IndexType::QUASI,
713 indexVersion,
714 true, k, largeIndex,
715 usePerfectHash);
716 // Set the hash info
717 std::string seqHash;
718 std::string nameHash;
719 picosha2::get_hash_hex_string(seqHasher, seqHash);
720 picosha2::get_hash_hex_string(nameHasher, nameHash);
721 header.setSeqHash(seqHash);
722 header.setNameHash(nameHash);
723
724 // Finally (since everything presumably succeeded) write the header
725 std::ofstream headerStream(outputDir + "header.json");
726 {
727 cereal::JSONOutputArchive archive(headerStream);
728 archive(header);
729 }
730 headerStream.close();
731}
732
733std::string readData(std::istream& in)
734 {
735 std::string inputStr;
736 std::string line;
737
738 while (getline(in, line)) {
739 if (line == "^D")
740 break;
741 inputStr +=line + "\n";
742 }
743 return inputStr;
744 }
745
746int rapMapSAIndex(int argc, char* argv[]) {
747 std::cerr << "RapMap Indexer\n";
748
749
750
751
752
753
754
755
756 TCLAP::CmdLine cmd("RapMap Indexer");
757 TCLAP::ValueArg<std::string> transcripts("t", "transcripts",
758 "The transcript file to be indexed",
759 false, readData(std::cin), "string");
760 TCLAP::ValueArg<std::string> index(
761 "i", "index", "The location where the index should be written", true, "",
762 "path");
763 TCLAP::ValueArg<uint32_t> kval("k", "klen", "The length of k-mer to index",
764 false, 31, "positive integer less than 32");
765
766 TCLAP::ValueArg<std::string> customSeps("s", "headerSep", "Instead of a space or tab, break the header at the first "
767 "occurrence of this string, and name the transcript as the token before "
768 "the first separator", false, " \t", "string");
769 TCLAP::SwitchArg noClip(
770 "n", "noClip",
771 "Don't clip poly-A tails from the ends of target sequences", false);
772 TCLAP::SwitchArg perfectHash(
773 "p", "perfectHash", "Use a perfect hash instead of sparse hash --- "
774 "somewhat slows construction, but uses less memory",
775 false);
776 /*
777 TCLAP::SwitchArg perfectHash(
778 "f", "frugalPerfectHash", "Use a frugal variant of the perfect hash --- "
779 "this will considerably slow construction, and somewhat slow lookup, but "
780 "hash construction and the subsequent mapping will require the least memory."
781 false);
782 */
783 TCLAP::ValueArg<uint32_t> numHashThreads(
784 "x", "numThreads",
785 "Use this many threads to build the perfect hash function", false, 4,
786 "positive integer <= # cores");
787 cmd.add(transcripts);
788 cmd.add(index);
789 cmd.add(kval);
790 cmd.add(noClip);
791 cmd.add(perfectHash);
792 cmd.add(customSeps);
793 cmd.add(numHashThreads);
794 cmd.parse(argc, argv);
795
796 // stupid parsing for now
797 std::string transcriptFile(transcripts.getValue());
798 std::vector<std::string> transcriptFiles({transcriptFile});
799 //std::vector<std::string> transcriptFiles(transcriptFile);
800
801
802 std::string sepStr = customSeps.getValue();
803
804 uint32_t k = kval.getValue();
805 if (k % 2 == 0) {
806 std::cerr << "Error: k must be an odd value, you chose " << k << '\n';
807 std::exit(1);
808 } else if (k > 31) {
809 std::cerr << "Error: k must not be larger than 31, you chose " << k << '\n';
810 std::exit(1);
811 }
812 rapmap::utils::my_mer::k(k);
813
814 std::string indexDir = index.getValue();
815 if (indexDir.back() != '/') {
816 indexDir += '/';
817 }
818 bool dirExists = rapmap::fs::DirExists(indexDir.c_str());
819 bool dirIsFile = rapmap::fs::FileExists(indexDir.c_str());
820 if (dirIsFile) {
821 std::cerr << "The requested index directory already exists as a file.";
822 std::exit(1);
823 }
824 if (!dirExists) {
825 rapmap::fs::MakeDir(indexDir.c_str());
826 }
827
828 std::string logPath = indexDir + "quasi_index.log";
829 auto fileSink = std::make_shared<spdlog::sinks::simple_file_sink_st>(logPath);
830 auto consoleSink = std::make_shared<spdlog::sinks::stderr_sink_st>();
831 auto consoleLog = spdlog::create("stderrLog", {consoleSink});
832 auto fileLog = spdlog::create("fileLog", {fileSink});
833 auto jointLog = spdlog::create("jointLog", {fileSink, consoleSink});
834
835 size_t numThreads{1};
836
837 std::unique_ptr<single_parser> transcriptParserPtr{nullptr};
838
839 size_t numProd = 1;
840 transcriptParserPtr.reset(
841 new single_parser(transcriptFiles, numThreads, numProd));
842 transcriptParserPtr->start();
843 bool noClipPolyA = noClip.getValue();
844 bool usePerfectHash = perfectHash.getValue();
845 uint32_t numPerfectHashThreads = numHashThreads.getValue();
846 std::mutex iomutex;
847 indexTranscriptsSA(transcriptParserPtr.get(), indexDir, noClipPolyA,
848 usePerfectHash, numPerfectHashThreads, sepStr, iomutex, jointLog);
849
850 // Output info about the reference
851 std::ofstream refInfoStream(indexDir + "refInfo.json");
852 {
853 cereal::JSONOutputArchive archive(refInfoStream);
854 archive(cereal::make_nvp("ReferenceFiles", transcriptFiles));
855 }
856 refInfoStream.close();
857
858 return 0;
859}