summaryrefslogtreecommitdiffstats
path: root/web/server/h2o/libh2o/deps/klib/lua/bio.lua
blob: c9f2200594e87414ceea0e2e7ed1000beb3797be (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
-- bioinformatics routines

-- Description: read a fasta/fastq file
local function readseq(fp)
	local finished, last = false, nil;
	return function()
		local match;
		if finished then return nil end
		if (last == nil) then -- the first record or a record following a fastq
			for l in fp:lines() do
				if l:byte(1) == 62 or l:byte(1) == 64 then -- ">" || "@"
					last = l;
					break;
				end
			end
			if last == nil then
				finished = true;
				return nil;
			end
		end
		local tmp = last:find("%s");
		name = (tmp and last:sub(2, tmp-1)) or last:sub(2); -- sequence name
		local seqs = {};
		local c; -- the first character of the last line
		last = nil;
		for l in fp:lines() do -- read sequence
			c = l:byte(1);
			if c == 62 or c == 64 or c == 43 then
				last = l;
				break;
			end
			table.insert(seqs, l);
		end
		if last == nil then finished = true end -- end of file
		if c ~= 43 then return name, table.concat(seqs) end -- a fasta record
		local seq, len = table.concat(seqs), 0; -- prepare to parse quality
		seqs = {};
		for l in fp:lines() do -- read quality
			table.insert(seqs, l);
			len = len + #l;
			if len >= #seq then
				last = nil;
				return name, seq, table.concat(seqs);
			end
		end
		finished = true;
		return name, seq;
	end
end

-- extract subsequence from a fasta file indexe by samtools faidx
local function faidxsub(fn)
	local fpidx = io.open(fn .. ".fai");
	if fpidx == nil then
		io.stderr:write("[faidxsub] fail to open the FASTA index file.\n");
		return nil
	end
	local idx = {};
	for l in fpidx:lines() do
		local name, len, offset, line_blen, line_len = l:match("(%S+)%s(%d+)%s(%d+)%s(%d+)%s(%d+)");
		if name then
			idx[name] = {tonumber(len), offset, line_blen, line_len};
		end
	end
	fpidx:close();
	local fp = io.open(fn);
	return function(name, beg_, end_) -- 0-based coordinate
		if name == nil then fp:close(); return nil; end
		if idx[name] then
			local a = idx[name];
			beg_ = beg_ or 0;
			end_ = end_ or a[1];
			end_ = (end_ <= a[1] and end_) or a[1];
			local fb, fe = math.floor(beg_ / a[3]), math.floor(end_ / a[3]);
			local qb, qe = beg_ - fb * a[3], end_ - fe * a[3];
			fp:seek("set", a[2] + fb * a[4] + qb);
			local s = fp:read((fe - fb) * a[4] + (qe - qb)):gsub("%s", "");
			return s;
		end
	end
end

--Description: Index a list of intervals and test if a given interval overlaps with the list
--Example: lua -lbio -e 'a={{100,201},{200,300},{400,600}};f=bio.intvovlp(a);print(f(600,700))'
--[[
  By default, we keep for each tiling 8192 window the interval overlaping the
  window while having the smallest start position. This method may not work
  well when most intervals are small but few intervals span a long distance.
]]--
local function intvovlp(intv, bits)
	bits = bits or 13 -- the default bin size is 8192 = 1<<13
	table.sort(intv, function(a,b) return a[1] < b[1] end) -- sort by the start
	-- merge intervals; the step speeds up testing, but can be skipped
	local b, e, k = -1, -1, 1
	for i = 1, #intv do
		if e < intv[i][1] then
			if e >= 0 then intv[k], k = {b, e}, k + 1 end
			b, e = intv[i][1], intv[i][2]
		else e = intv[i][2] end
	end
	if e >= 0 then intv[k] = {b, e} end
	while #a > k do table.remove(a) end -- truncate the interval list
	-- build the index for the list of intervals
	local idx, size, max = {}, math.pow(2, bits), 0
	for i = 1, #a do
		b = math.modf(intv[i][1] / size)
		e = math.modf(intv[i][2] / size)
		if b == e then idx[b] = idx[b] or i
		else for j = b, e do idx[j] = idx[j] or i end end
		max = (max > e and max) or e
	end
	-- return a function (closure)
	return function(_beg, _end)
		local x = math.modf(_beg / size)
		if x > max then return false end
		local off = idx[x]; -- the start bin
		if off == nil then -- the following is not the best in efficiency
			for i = x - 1, 0, -1 do -- find the minimum bin with a value
				if idx[i] ~= nil then off = idx[i]; break; end
			end
			if off == nil then return false end
		end
		for i = off, #intv do -- start from off and search for overlaps
			if intv[i][1] >= _end then return false
			elseif intv[i][2] > _beg then return true end 
		end
		return false
	end
end

bio = {
	readseq = readseq,
	faidxsub = faidxsub,
	intvovlp = intvovlp
}

bio.nt16 = {
[0]=15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
	15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, 15,15, 5, 6,  8,15, 7, 9,  0,10,15,15, 15,15,15,15,
	15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, 15,15, 5, 6,  8,15, 7, 9,  0,10,15,15, 15,15,15,15,
	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
}
bio.ntcnt  = { [0]=4, 1, 1,  2, 1,  2, 2,  3, 1, 2, 2,  3, 2,  3, 3,  4 }
bio.ntcomp = { [0]=0, 8, 4, 12, 2, 10, 9, 14, 1, 6, 5, 13, 3, 11, 7, 15 }
bio.ntrev  = 'XACMGRSVTWYHKDBN'