diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-03-09 13:19:22 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-03-09 13:19:22 +0000 |
commit | c21c3b0befeb46a51b6bf3758ffa30813bea0ff0 (patch) | |
tree | 9754ff1ca740f6346cf8483ec915d4054bc5da2d /web/server/h2o/libh2o/deps/klib/lua/bio.lua | |
parent | Adding upstream version 1.43.2. (diff) | |
download | netdata-c21c3b0befeb46a51b6bf3758ffa30813bea0ff0.tar.xz netdata-c21c3b0befeb46a51b6bf3758ffa30813bea0ff0.zip |
Adding upstream version 1.44.3.upstream/1.44.3
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'web/server/h2o/libh2o/deps/klib/lua/bio.lua')
-rw-r--r-- | web/server/h2o/libh2o/deps/klib/lua/bio.lua | 149 |
1 files changed, 149 insertions, 0 deletions
diff --git a/web/server/h2o/libh2o/deps/klib/lua/bio.lua b/web/server/h2o/libh2o/deps/klib/lua/bio.lua new file mode 100644 index 000000000..c9f220059 --- /dev/null +++ b/web/server/h2o/libh2o/deps/klib/lua/bio.lua @@ -0,0 +1,149 @@ +-- bioinformatics routines + +-- Description: read a fasta/fastq file +local function readseq(fp) + local finished, last = false, nil; + return function() + local match; + if finished then return nil end + if (last == nil) then -- the first record or a record following a fastq + for l in fp:lines() do + if l:byte(1) == 62 or l:byte(1) == 64 then -- ">" || "@" + last = l; + break; + end + end + if last == nil then + finished = true; + return nil; + end + end + local tmp = last:find("%s"); + name = (tmp and last:sub(2, tmp-1)) or last:sub(2); -- sequence name + local seqs = {}; + local c; -- the first character of the last line + last = nil; + for l in fp:lines() do -- read sequence + c = l:byte(1); + if c == 62 or c == 64 or c == 43 then + last = l; + break; + end + table.insert(seqs, l); + end + if last == nil then finished = true end -- end of file + if c ~= 43 then return name, table.concat(seqs) end -- a fasta record + local seq, len = table.concat(seqs), 0; -- prepare to parse quality + seqs = {}; + for l in fp:lines() do -- read quality + table.insert(seqs, l); + len = len + #l; + if len >= #seq then + last = nil; + return name, seq, table.concat(seqs); + end + end + finished = true; + return name, seq; + end +end + +-- extract subsequence from a fasta file indexe by samtools faidx +local function faidxsub(fn) + local fpidx = io.open(fn .. ".fai"); + if fpidx == nil then + io.stderr:write("[faidxsub] fail to open the FASTA index file.\n"); + return nil + end + local idx = {}; + for l in fpidx:lines() do + local name, len, offset, line_blen, line_len = l:match("(%S+)%s(%d+)%s(%d+)%s(%d+)%s(%d+)"); + if name then + idx[name] = {tonumber(len), offset, line_blen, line_len}; + end + end + fpidx:close(); + local fp = io.open(fn); + return function(name, beg_, end_) -- 0-based coordinate + if name == nil then fp:close(); return nil; end + if idx[name] then + local a = idx[name]; + beg_ = beg_ or 0; + end_ = end_ or a[1]; + end_ = (end_ <= a[1] and end_) or a[1]; + local fb, fe = math.floor(beg_ / a[3]), math.floor(end_ / a[3]); + local qb, qe = beg_ - fb * a[3], end_ - fe * a[3]; + fp:seek("set", a[2] + fb * a[4] + qb); + local s = fp:read((fe - fb) * a[4] + (qe - qb)):gsub("%s", ""); + return s; + end + end +end + +--Description: Index a list of intervals and test if a given interval overlaps with the list +--Example: lua -lbio -e 'a={{100,201},{200,300},{400,600}};f=bio.intvovlp(a);print(f(600,700))' +--[[ + By default, we keep for each tiling 8192 window the interval overlaping the + window while having the smallest start position. This method may not work + well when most intervals are small but few intervals span a long distance. +]]-- +local function intvovlp(intv, bits) + bits = bits or 13 -- the default bin size is 8192 = 1<<13 + table.sort(intv, function(a,b) return a[1] < b[1] end) -- sort by the start + -- merge intervals; the step speeds up testing, but can be skipped + local b, e, k = -1, -1, 1 + for i = 1, #intv do + if e < intv[i][1] then + if e >= 0 then intv[k], k = {b, e}, k + 1 end + b, e = intv[i][1], intv[i][2] + else e = intv[i][2] end + end + if e >= 0 then intv[k] = {b, e} end + while #a > k do table.remove(a) end -- truncate the interval list + -- build the index for the list of intervals + local idx, size, max = {}, math.pow(2, bits), 0 + for i = 1, #a do + b = math.modf(intv[i][1] / size) + e = math.modf(intv[i][2] / size) + if b == e then idx[b] = idx[b] or i + else for j = b, e do idx[j] = idx[j] or i end end + max = (max > e and max) or e + end + -- return a function (closure) + return function(_beg, _end) + local x = math.modf(_beg / size) + if x > max then return false end + local off = idx[x]; -- the start bin + if off == nil then -- the following is not the best in efficiency + for i = x - 1, 0, -1 do -- find the minimum bin with a value + if idx[i] ~= nil then off = idx[i]; break; end + end + if off == nil then return false end + end + for i = off, #intv do -- start from off and search for overlaps + if intv[i][1] >= _end then return false + elseif intv[i][2] > _beg then return true end + end + return false + end +end + +bio = { + readseq = readseq, + faidxsub = faidxsub, + intvovlp = intvovlp +} + +bio.nt16 = { +[0]=15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15, + 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 +} +bio.ntcnt = { [0]=4, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 } +bio.ntcomp = { [0]=0, 8, 4, 12, 2, 10, 9, 14, 1, 6, 5, 13, 3, 11, 7, 15 } +bio.ntrev = 'XACMGRSVTWYHKDBN' |