1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
|
-- bioinformatics routines
-- Description: read a fasta/fastq file
local function readseq(fp)
local finished, last = false, nil;
return function()
local match;
if finished then return nil end
if (last == nil) then -- the first record or a record following a fastq
for l in fp:lines() do
if l:byte(1) == 62 or l:byte(1) == 64 then -- ">" || "@"
last = l;
break;
end
end
if last == nil then
finished = true;
return nil;
end
end
local tmp = last:find("%s");
name = (tmp and last:sub(2, tmp-1)) or last:sub(2); -- sequence name
local seqs = {};
local c; -- the first character of the last line
last = nil;
for l in fp:lines() do -- read sequence
c = l:byte(1);
if c == 62 or c == 64 or c == 43 then
last = l;
break;
end
table.insert(seqs, l);
end
if last == nil then finished = true end -- end of file
if c ~= 43 then return name, table.concat(seqs) end -- a fasta record
local seq, len = table.concat(seqs), 0; -- prepare to parse quality
seqs = {};
for l in fp:lines() do -- read quality
table.insert(seqs, l);
len = len + #l;
if len >= #seq then
last = nil;
return name, seq, table.concat(seqs);
end
end
finished = true;
return name, seq;
end
end
-- extract subsequence from a fasta file indexe by samtools faidx
local function faidxsub(fn)
local fpidx = io.open(fn .. ".fai");
if fpidx == nil then
io.stderr:write("[faidxsub] fail to open the FASTA index file.\n");
return nil
end
local idx = {};
for l in fpidx:lines() do
local name, len, offset, line_blen, line_len = l:match("(%S+)%s(%d+)%s(%d+)%s(%d+)%s(%d+)");
if name then
idx[name] = {tonumber(len), offset, line_blen, line_len};
end
end
fpidx:close();
local fp = io.open(fn);
return function(name, beg_, end_) -- 0-based coordinate
if name == nil then fp:close(); return nil; end
if idx[name] then
local a = idx[name];
beg_ = beg_ or 0;
end_ = end_ or a[1];
end_ = (end_ <= a[1] and end_) or a[1];
local fb, fe = math.floor(beg_ / a[3]), math.floor(end_ / a[3]);
local qb, qe = beg_ - fb * a[3], end_ - fe * a[3];
fp:seek("set", a[2] + fb * a[4] + qb);
local s = fp:read((fe - fb) * a[4] + (qe - qb)):gsub("%s", "");
return s;
end
end
end
--Description: Index a list of intervals and test if a given interval overlaps with the list
--Example: lua -lbio -e 'a={{100,201},{200,300},{400,600}};f=bio.intvovlp(a);print(f(600,700))'
--[[
By default, we keep for each tiling 8192 window the interval overlaping the
window while having the smallest start position. This method may not work
well when most intervals are small but few intervals span a long distance.
]]--
local function intvovlp(intv, bits)
bits = bits or 13 -- the default bin size is 8192 = 1<<13
table.sort(intv, function(a,b) return a[1] < b[1] end) -- sort by the start
-- merge intervals; the step speeds up testing, but can be skipped
local b, e, k = -1, -1, 1
for i = 1, #intv do
if e < intv[i][1] then
if e >= 0 then intv[k], k = {b, e}, k + 1 end
b, e = intv[i][1], intv[i][2]
else e = intv[i][2] end
end
if e >= 0 then intv[k] = {b, e} end
while #a > k do table.remove(a) end -- truncate the interval list
-- build the index for the list of intervals
local idx, size, max = {}, math.pow(2, bits), 0
for i = 1, #a do
b = math.modf(intv[i][1] / size)
e = math.modf(intv[i][2] / size)
if b == e then idx[b] = idx[b] or i
else for j = b, e do idx[j] = idx[j] or i end end
max = (max > e and max) or e
end
-- return a function (closure)
return function(_beg, _end)
local x = math.modf(_beg / size)
if x > max then return false end
local off = idx[x]; -- the start bin
if off == nil then -- the following is not the best in efficiency
for i = x - 1, 0, -1 do -- find the minimum bin with a value
if idx[i] ~= nil then off = idx[i]; break; end
end
if off == nil then return false end
end
for i = off, #intv do -- start from off and search for overlaps
if intv[i][1] >= _end then return false
elseif intv[i][2] > _beg then return true end
end
return false
end
end
bio = {
readseq = readseq,
faidxsub = faidxsub,
intvovlp = intvovlp
}
bio.nt16 = {
[0]=15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15,
15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15,
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
}
bio.ntcnt = { [0]=4, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }
bio.ntcomp = { [0]=0, 8, 4, 12, 2, 10, 9, 14, 1, 6, 5, 13, 3, 11, 7, 15 }
bio.ntrev = 'XACMGRSVTWYHKDBN'
|