-- bioinformatics routines -- Description: read a fasta/fastq file local function readseq(fp) local finished, last = false, nil; return function() local match; if finished then return nil end if (last == nil) then -- the first record or a record following a fastq for l in fp:lines() do if l:byte(1) == 62 or l:byte(1) == 64 then -- ">" || "@" last = l; break; end end if last == nil then finished = true; return nil; end end local tmp = last:find("%s"); name = (tmp and last:sub(2, tmp-1)) or last:sub(2); -- sequence name local seqs = {}; local c; -- the first character of the last line last = nil; for l in fp:lines() do -- read sequence c = l:byte(1); if c == 62 or c == 64 or c == 43 then last = l; break; end table.insert(seqs, l); end if last == nil then finished = true end -- end of file if c ~= 43 then return name, table.concat(seqs) end -- a fasta record local seq, len = table.concat(seqs), 0; -- prepare to parse quality seqs = {}; for l in fp:lines() do -- read quality table.insert(seqs, l); len = len + #l; if len >= #seq then last = nil; return name, seq, table.concat(seqs); end end finished = true; return name, seq; end end -- extract subsequence from a fasta file indexe by samtools faidx local function faidxsub(fn) local fpidx = io.open(fn .. ".fai"); if fpidx == nil then io.stderr:write("[faidxsub] fail to open the FASTA index file.\n"); return nil end local idx = {}; for l in fpidx:lines() do local name, len, offset, line_blen, line_len = l:match("(%S+)%s(%d+)%s(%d+)%s(%d+)%s(%d+)"); if name then idx[name] = {tonumber(len), offset, line_blen, line_len}; end end fpidx:close(); local fp = io.open(fn); return function(name, beg_, end_) -- 0-based coordinate if name == nil then fp:close(); return nil; end if idx[name] then local a = idx[name]; beg_ = beg_ or 0; end_ = end_ or a[1]; end_ = (end_ <= a[1] and end_) or a[1]; local fb, fe = math.floor(beg_ / a[3]), math.floor(end_ / a[3]); local qb, qe = beg_ - fb * a[3], end_ - fe * a[3]; fp:seek("set", a[2] + fb * a[4] + qb); local s = fp:read((fe - fb) * a[4] + (qe - qb)):gsub("%s", ""); return s; end end end --Description: Index a list of intervals and test if a given interval overlaps with the list --Example: lua -lbio -e 'a={{100,201},{200,300},{400,600}};f=bio.intvovlp(a);print(f(600,700))' --[[ By default, we keep for each tiling 8192 window the interval overlaping the window while having the smallest start position. This method may not work well when most intervals are small but few intervals span a long distance. ]]-- local function intvovlp(intv, bits) bits = bits or 13 -- the default bin size is 8192 = 1<<13 table.sort(intv, function(a,b) return a[1] < b[1] end) -- sort by the start -- merge intervals; the step speeds up testing, but can be skipped local b, e, k = -1, -1, 1 for i = 1, #intv do if e < intv[i][1] then if e >= 0 then intv[k], k = {b, e}, k + 1 end b, e = intv[i][1], intv[i][2] else e = intv[i][2] end end if e >= 0 then intv[k] = {b, e} end while #a > k do table.remove(a) end -- truncate the interval list -- build the index for the list of intervals local idx, size, max = {}, math.pow(2, bits), 0 for i = 1, #a do b = math.modf(intv[i][1] / size) e = math.modf(intv[i][2] / size) if b == e then idx[b] = idx[b] or i else for j = b, e do idx[j] = idx[j] or i end end max = (max > e and max) or e end -- return a function (closure) return function(_beg, _end) local x = math.modf(_beg / size) if x > max then return false end local off = idx[x]; -- the start bin if off == nil then -- the following is not the best in efficiency for i = x - 1, 0, -1 do -- find the minimum bin with a value if idx[i] ~= nil then off = idx[i]; break; end end if off == nil then return false end end for i = off, #intv do -- start from off and search for overlaps if intv[i][1] >= _end then return false elseif intv[i][2] > _beg then return true end end return false end end bio = { readseq = readseq, faidxsub = faidxsub, intvovlp = intvovlp } bio.nt16 = { [0]=15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15, 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 } bio.ntcnt = { [0]=4, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 } bio.ntcomp = { [0]=0, 8, 4, 12, 2, 10, 9, 14, 1, 6, 5, 13, 3, 11, 7, 15 } bio.ntrev = 'XACMGRSVTWYHKDBN'