-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathblhead.cpp
131 lines (106 loc) · 3.35 KB
/
blhead.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
/*
* Unix head command, but for biological sequence files
*
*/
#include <iostream>
#include <queue>
#include <string>
#include <vector>
#include <seqan/seq_io.h>
#include <tclap/CmdLine.h>
#include <SeqFileInWrapper.h>
using std::cerr;
using std::cin;
using std::cout;
using std::endl;
using std::ifstream;
using std::istream;
using std::queue;
using std::string;
using std::vector;
using namespace seqan;
using namespace bltools;
int main(int argc, char * argv[]) {
TCLAP::CmdLine cmd("Equivalent of `head' for sequence files", ' ', "0.0");
TCLAP::ValueArg<string> format_arg("o", "output-format",
"Output format: fasta or fastq; fasta is default",
false, "fasta", "fast[aq]", cmd);
TCLAP::ValueArg<int> nlines_arg("n", "lines",
"print the first n lines of each file",
false, 10, "int", cmd);
TCLAP::UnlabeledMultiArg<string> files("FILE(s)", "filenames", false,
"file name(s)", cmd, false);
cmd.parse(argc, argv);
string format = format_arg.getValue();
vector<string> infiles = files.getValue();
if(infiles.size() == 0) infiles.push_back("-");
int nlines = nlines_arg.getValue();
unsigned look_ahead = 0;
if(nlines < 0) {
look_ahead = -1 * nlines;
}
SeqFileOut out_handle(cout, Fasta());
if(format == "fasta") {
setFormat(out_handle, Fasta());
} else if(format == "fastq") {
setFormat(out_handle, Fastq());
} else {
cerr << "Unrecognized output format";
return 1;
}
CharString id;
queue<CharString> ids;
CharString seq; // CharString more flexible than Dna5String
queue<CharString> seqs;
CharString qual;
queue<CharString> quals;
SeqFileInWrapper seq_handle;
for(string& infile: infiles) {
try {
seq_handle.open(infile);
} catch(Exception const &e) {
cerr << "Could not open " << infile << endl;
seq_handle.close();
return 1;
}
int nrecs_read = 0;
// Fill up seqs, quals, ids until look_ahead is reached, then for
// every additional record, pop one off of seqs, quals, and ids, and
// push the new one on until the end of the file is reached.
while(!seq_handle.atEnd() && ((look_ahead == 0 && nrecs_read < nlines) || look_ahead > 0)) {
try {
readRecord(id, seq, qual, seq_handle.sqh);
} catch (Exception const &e) {
cerr << "Error: " << e.what() << endl;
seq_handle.close();
close(out_handle);
return 1;
} // End try-catch for record reading.
if(look_ahead > 0) {
seqs.push(seq); ids.push(id); quals.push(qual);
if(seqs.size() > look_ahead) {
id = ids.front(); seq = seqs.front(); qual = quals.front();
ids.pop(); seqs.pop(); quals.pop();
} else {
continue;
}
}
// Write output
try {
writeRecord(out_handle, id, seq, qual);
nrecs_read++;
} catch (Exception const &e) {
cerr << "Error writing output";
seq_handle.close();
return 1;
}
} // End single file reading loop
if(!seq_handle.close()) {
cerr << "Problem closing " << infile << endl;
close(out_handle);
return 1;
}
} // End loop over files
close(out_handle);
return 0;
}