53 files changed, 10866 insertions, 0 deletions
diff --git a/web/server/h2o/libh2o/deps/klib/.gitignore b/web/server/h2o/libh2o/deps/klib/.gitignore
new file mode 100644
index 000000000..010a8ebe6
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/.gitignore
@@ -0,0 +1,40 @@
+# General
+*.a
+*.dSYM/
+*.la
+*.lo
+*.o
+*.opensdf
+*.orig
+*.sdf
+*.suo
+*.swp
+*.tests
+*.vcxproj.filters
+*.vcxproj.user
+*~
+.git
+TAGS
+
+# Mac/Xcode-specific
+xcuserdata
+contents.xcworkspacedata
+.DS_Store
+._*
+
+# Test byproducts
+test/kbtree_test
+test/khash_keith
+test/khash_keith2
+test/khash_test
+test/klist_test
+test/kmin_test
+test/kseq_bench
+test/kseq_bench2
+test/kseq_test
+test/ksort_test
+test/ksort_test-stl
+test/kstring_bench
+test/kstring_bench2
+test/kstring_test
+test/kvec_test
diff --git a/web/server/h2o/libh2o/deps/klib/README.md b/web/server/h2o/libh2o/deps/klib/README.md
new file mode 100644
index 000000000..ddd74f470
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/README.md
@@ -0,0 +1,237 @@
+# Klib: a Generic Library in C
+
+## <a name="overview"></a>Overview
+
+Klib is a standalone and lightweight C library distributed under [MIT/X11
+license][1]. Most components are independent of external libraries, except the
+standard C library, and independent of each other. To use a component of this
+library, you only need to copy a couple of files to your source code tree
+without worrying about library dependencies.
+
+Klib strives for efficiency and a small memory footprint. Some components, such
+as khash.h, kbtree.h, ksort.h and kvec.h, are among the most efficient
+implementations of similar algorithms or data structures in all programming
+languages, in terms of both speed and memory use.
+
+New documentation is available [here](http://attractivechaos.github.io/klib/)
+which includes most information in this README file.
+
+#### Common components
+
+* [khash.h][khash]: generic hash table based on [double hashing][2].
+* [kbtree.h][kbtree]: generic search tree based on [B-tree][3].
+* [ksort.h][ksort]: generic sort, including [introsort][4], [merge sort][5], [heap sort][6], [comb sort][7], [Knuth shuffle][8] and the [k-small][9] algorithm.
+* [kseq.h][kseq]: generic stream buffer and a [FASTA][10]/[FASTQ][11] format parser.
+* kvec.h: generic dynamic array.
+* klist.h: generic single-linked list and [memory pool][12].
+* kstring.{h,c}: basic string library.
+* kmath.{h,c}: numerical routines including [MT19937-64][13] [pseudorandom generator][14], basic [nonlinear programming][15] and a few special math functions.
+
+#### Components for more specific use cases
+
+* ksa.c: constructing [suffix arrays][16] for strings with multiple sentinels, based on a revised [SAIS algorithm][17].
+* knetfile.{h,c}: random access to remote files on HTTP or FTP.
+* kopen.c: smart stream opening.
+* khmm.{h,c}: basic [HMM][18] library.
+* ksw.{h,c}: Striped [Smith-Waterman algorithm][19].
+* knhx.{h,c}: [Newick tree format][20] parser.
+
+
+## <a name="methodology"></a>Methodology
+
+For the implementation of generic [containers][21], klib extensively uses C
+macros. To use these data structures, we usually need to instantiate methods by
+expanding a long macro. This makes the source code look unusual or even ugly
+and adds difficulty to debugging. Unfortunately, for efficient generic
+programming in C, which lacks [templates][22], using macros is the only
+solution. Only with macros can we write a generic container which, once
+instantiated, competes with a type-specific container in efficiency. Some
+generic libraries in C, such as [Glib][23], use the `void*` type to implement
+containers. These implementations are usually slower and use more memory than
+klib (see [this benchmark][31]).
+
+To effectively use klib, it is important to understand how it achieves generic
+programming. We will use the hash table library as an example:
+
+    #include "khash.h"
+    KHASH_MAP_INIT_INT(m32, char)        // instantiate structs and methods
+    int main() {
+        int ret, is_missing;
+        khint_t k;
+        khash_t(m32) *h = kh_init(m32);  // allocate a hash table
+        k = kh_put(m32, h, 5, &ret);     // insert a key to the hash table
+        if (!ret) kh_del(m32, h, k);
+        kh_value(h, k) = 10;             // set the value
+        k = kh_get(m32, h, 10);          // query the hash table
+        is_missing = (k == kh_end(h));   // test if the key is present
+        k = kh_get(m32, h, 5);
+        kh_del(m32, h, k);               // remove a key-value pair
+        for (k = kh_begin(h); k != kh_end(h); ++k)  // traverse
+            if (kh_exist(h, k))          // test if a bucket contains data
+    			kh_value(h, k) = 1;
+        kh_destroy(m32, h);              // deallocate the hash table
+        return 0;
+    }
+
+In this example, the second line instantiates a hash table with `unsigned` as
+the key type and `char` as the value type. `m32` names such a type of hash table.
+All types and functions associated with this name are macros, which will be
+explained later. Macro `kh_init()` initiates a hash table and `kh_destroy()`
+frees it. `kh_put()` inserts a key and returns the iterator (or the position)
+in the hash table. `kh_get()` and `kh_del()` get a key and delete an element,
+respectively. Macro `kh_exist()` tests if an iterator (or a position) is filled
+with data.
+
+An immediate concern is that this piece of code does not look like a valid C
+program (e.g. lacking semicolon, assignment to an _apparent_ function call and
+_apparent_ undefined `m32` 'variable'). To understand why the code is correct,
+let's go a bit further into the source code of `khash.h`, whose skeleton looks
+like:
+
+    #define KHASH_INIT(name, SCOPE, key_t, val_t, is_map, _hashf, _hasheq) \
+      typedef struct { \
+        int n_buckets, size, n_occupied, upper_bound; \
+        unsigned *flags; \
+        key_t *keys; \
+        val_t *vals; \
+      } kh_##name##_t; \
+      SCOPE inline kh_##name##_t *init_##name() { \
+        return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \
+      } \
+      SCOPE inline int get_##name(kh_##name##_t *h, key_t k) \
+      ... \
+      SCOPE inline void destroy_##name(kh_##name##_t *h) { \
+        if (h) { \
+          free(h->keys); free(h->flags); free(h->vals); free(h); \
+        } \
+      }
+    
+    #define _int_hf(key) (unsigned)(key)
+    #define _int_heq(a, b) (a == b)
+    #define khash_t(name) kh_##name##_t
+    #define kh_value(h, k) ((h)->vals[k])
+    #define kh_begin(h) 0
+    #define kh_end(h) ((h)->n_buckets)
+    #define kh_init(name) init_##name()
+    #define kh_get(name, h, k) get_##name(h, k)
+    #define kh_destroy(name, h) destroy_##name(h)
+    ...
+    #define KHASH_MAP_INIT_INT(name, val_t) \
+    	KHASH_INIT(name, static, unsigned, val_t, is_map, _int_hf, _int_heq)
+
+`KHASH_INIT()` is a huge macro defining all the structs and methods. When this
+macro is called, all the code inside it will be inserted by the [C
+preprocessor][37] to the place where it is called. If the macro is called
+multiple times, multiple copies of the code will be inserted. To avoid naming
+conflict of hash tables with different key-value types, the library uses [token
+concatenation][36], which is a preprocessor feature whereby we can substitute
+part of a symbol based on the parameter of the macro. In the end, the C
+preprocessor will generate the following code and feed it to the compiler
+(macro `kh_exist(h,k)` is a little complex and not expanded for simplicity):
+
+    typedef struct {
+      int n_buckets, size, n_occupied, upper_bound;
+      unsigned *flags;
+      unsigned *keys;
+      char *vals;
+    } kh_m32_t;
+    static inline kh_m32_t *init_m32() {
+      return (kh_m32_t*)calloc(1, sizeof(kh_m32_t));
+    }
+    static inline int get_m32(kh_m32_t *h, unsigned k)
+    ...
+    static inline void destroy_m32(kh_m32_t *h) {
+      if (h) {
+        free(h->keys); free(h->flags); free(h->vals); free(h);
+      }
+    }
+
+	int main() {
+		int ret, is_missing;
+		khint_t k;
+		kh_m32_t *h = init_m32();
+		k = put_m32(h, 5, &ret);
+		if (!ret) del_m32(h, k);
+		h->vals[k] = 10;
+		k = get_m32(h, 10);
+		is_missing = (k == h->n_buckets);
+		k = get_m32(h, 5);
+		del_m32(h, k);
+		for (k = 0; k != h->n_buckets; ++k)
+			if (kh_exist(h, k)) h->vals[k] = 1;
+		destroy_m32(h);
+		return 0;
+	}
+
+This is the C program we know.
+
+From this example, we can see that macros and the C preprocessor play a key
+role in klib. Klib is fast partly because the compiler knows the key-value
+type at the compile time and is able to optimize the code to the same level
+as type-specific code. A generic library written with `void*` will not get such
+performance boost.
+
+Massively inserting code upon instantiation may remind us of C++'s slow
+compiling speed and huge binary size when STL/boost is in use. Klib is much
+better in this respect due to its small code size and component independency.
+Inserting several hundred lines of code won't make compiling obviously slower.
+
+## <a name="resources"></a>Resources
+
+* Library documentation, if present, is available in the header files. Examples
+can be found in the [test/][24] directory.
+* **Obsolete** documentation of the hash table library can be found at
+[SourceForge][25]. This README is partly adapted from the old documentation.
+* [Blog post][26] describing the hash table library.
+* [Blog post][27] on why using `void*` for generic programming may be inefficient.
+* [Blog post][28] on the generic stream buffer.
+* [Blog post][29] evaluating the performance of `kvec.h`.
+* [Blog post][30] arguing B-tree may be a better data structure than a binary search tree.
+* [Blog post][31] evaluating the performance of `khash.h` and `kbtree.h` among many other implementations.
+[An older version][33] of the benchmark is also available.
+* [Blog post][34] benchmarking internal sorting algorithms and implementations.
+* [Blog post][32] on the k-small algorithm.
+* [Blog post][35] on the Hooke-Jeeves algorithm for nonlinear programming.
+
+[1]: http://en.wikipedia.org/wiki/MIT_License
+[2]: http://en.wikipedia.org/wiki/Double_hashing
+[3]: http://en.wikipedia.org/wiki/B-tree
+[4]: http://en.wikipedia.org/wiki/Introsort
+[5]: http://en.wikipedia.org/wiki/Merge_sort
+[6]: http://en.wikipedia.org/wiki/Heapsort
+[7]: http://en.wikipedia.org/wiki/Comb_sort
+[8]: http://en.wikipedia.org/wiki/Fisher-Yates_shuffle
+[9]: http://en.wikipedia.org/wiki/Selection_algorithm
+[10]: http://en.wikipedia.org/wiki/FASTA_format
+[11]: http://en.wikipedia.org/wiki/FASTQ_format
+[12]: http://en.wikipedia.org/wiki/Memory_pool
+[13]: http://en.wikipedia.org/wiki/Mersenne_twister
+[14]: http://en.wikipedia.org/wiki/Pseudorandom_generator
+[15]: http://en.wikipedia.org/wiki/Nonlinear_programming
+[16]: http://en.wikipedia.org/wiki/Suffix_array
+[17]: https://sites.google.com/site/yuta256/sais
+[18]: http://en.wikipedia.org/wiki/Hidden_Markov_model
+[19]: http://en.wikipedia.org/wiki/Smith-Waterman_algorithm
+[20]: http://en.wikipedia.org/wiki/Newick_format
+[21]: http://en.wikipedia.org/wiki/Container_(abstract_data_type)
+[22]: http://en.wikipedia.org/wiki/Template_(C%2B%2B)
+[23]: http://en.wikipedia.org/wiki/GLib
+[24]: https://github.com/attractivechaos/klib/tree/master/test
+[25]: http://klib.sourceforge.net/
+[26]: http://attractivechaos.wordpress.com/2008/09/02/implementing-generic-hash-library-in-c/
+[27]: http://attractivechaos.wordpress.com/2008/10/02/using-void-in-generic-c-programming-may-be-inefficient/
+[28]: http://attractivechaos.wordpress.com/2008/10/11/a-generic-buffered-stream-wrapper/
+[29]: http://attractivechaos.wordpress.com/2008/09/19/c-array-vs-c-vector/
+[30]: http://attractivechaos.wordpress.com/2008/09/24/b-tree-vs-binary-search-tree/
+[31]: http://attractivechaos.wordpress.com/2008/10/07/another-look-at-my-old-benchmark/
+[32]: http://attractivechaos.wordpress.com/2008/09/13/calculating-median/
+[33]: http://attractivechaos.wordpress.com/2008/08/28/comparison-of-hash-table-libraries/
+[34]: http://attractivechaos.wordpress.com/2008/08/28/comparison-of-internal-sorting-algorithms/
+[35]: http://attractivechaos.wordpress.com/2008/08/24/derivative-free-optimization-dfo/
+[36]: http://en.wikipedia.org/wiki/C_preprocessor#Token_concatenation
+[37]: http://en.wikipedia.org/wiki/C_preprocessor
+
+[kbtree]: http://attractivechaos.github.io/klib/#KBtree%3A%20generic%20ordered%20map:%5B%5BKBtree%3A%20generic%20ordered%20map%5D%5D
+[khash]: http://attractivechaos.github.io/klib/#Khash%3A%20generic%20hash%20table:%5B%5BKhash%3A%20generic%20hash%20table%5D%5D
+[kseq]: http://attractivechaos.github.io/klib/#Kseq%3A%20stream%20buffer%20and%20FASTA%2FQ%20parser:%5B%5BKseq%3A%20stream%20buffer%20and%20FASTA%2FQ%20parser%5D%5D
+[ksort]: http://attractivechaos.github.io/klib/#Ksort%3A%20sorting%2C%20shuffling%2C%20heap%20and%20k-small:%5B%5BKsort%3A%20sorting%2C%20shuffling%2C%20heap%20and%20k-small%5D%5D
diff --git a/web/server/h2o/libh2o/deps/klib/bgzf.c b/web/server/h2o/libh2o/deps/klib/bgzf.c
new file mode 100644
index 000000000..9833414f9
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/bgzf.c
@@ -0,0 +1,555 @@
+/* The MIT License
+
+   Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
+                 2011 Attractive Chaos <attractor@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <sys/types.h>
+#include "bgzf.h"
+
+#ifdef _USE_KNETFILE // the _bgzf_* names below forward to the knet_* calls from knetfile.h
+#include "knetfile.h"
+typedef knetFile *_bgzf_file_t;
+#define _bgzf_open(fn, mode) knet_open(fn, mode)
+#define _bgzf_dopen(fp, mode) knet_dopen(fp, mode)
+#define _bgzf_close(fp) knet_close(fp)
+#define _bgzf_fileno(fp) ((fp)->fd)
+#define _bgzf_tell(fp) knet_tell(fp)
+#define _bgzf_seek(fp, offset, whence) knet_seek(fp, offset, whence)
+#define _bgzf_read(fp, buf, len) knet_read(fp, buf, len)
+#define _bgzf_write(fp, buf, len) knet_write(fp, buf, len)
+#else // ~defined(_USE_KNETFILE): the same _bgzf_* names forward to stdio
+#if defined(_WIN32) || defined(_MSC_VER)
+#define ftello(fp) ftell(fp)
+#define fseeko(fp, offset, whence) fseek(fp, offset, whence)
+#else // ~defined(_WIN32)
+extern off_t ftello(FILE *stream);
+extern int fseeko(FILE *stream, off_t offset, int whence);
+#endif // ~defined(_WIN32)
+typedef FILE *_bgzf_file_t;
+#define _bgzf_open(fn, mode) fopen(fn, mode)
+#define _bgzf_dopen(fp, mode) fdopen(fp, mode)
+#define _bgzf_close(fp) fclose(fp)
+#define _bgzf_fileno(fp) fileno(fp)
+#define _bgzf_tell(fp) ftello(fp)
+#define _bgzf_seek(fp, offset, whence) fseeko(fp, offset, whence)
+#define _bgzf_read(fp, buf, len) fread(buf, 1, len, fp)
+#define _bgzf_write(fp, buf, len) fwrite(buf, 1, len, fp)
+#endif // ~defined(_USE_KNETFILE)
+
+#define BLOCK_HEADER_LENGTH 18 // bytes in front of the deflate data (layout drawn below)
+#define BLOCK_FOOTER_LENGTH 8 // bytes after the deflate data: 4-byte CRC32 then 4-byte input length
+
+/* BGZF/GZIP header (specialized from RFC 1952; little endian):
+ +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+ | 31|139|  8|  4|              0|  0|255|      6| 66| 67|      2|BLK_LEN|
+ +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+*/
+static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0"; // 18 header bytes + NUL; the last two are the BLK_LEN placeholder
+
+#ifdef BGZF_CACHE
+typedef struct { // one cached block; entries are keyed by the block's file address
+	int size; // fp->block_length at the time the block was cached
+	uint8_t *block; // malloc'd BGZF_BLOCK_SIZE-byte copy of the uncompressed block
+	int64_t end_offset; // block address + bytes read for the block; the file is re-seeked here on a cache hit
+} cache_t;
+#include "khash.h"
+KHASH_MAP_INIT_INT64(cache, cache_t) // instantiates the khash map type named `cache` with cache_t values
+#endif
+// Write `value` to buffer[0..1] in little-endian order (low byte first).
+static inline void packInt16(uint8_t *buffer, uint16_t value)
+{
+	buffer[0] = (uint8_t)(value & 0xff);
+	buffer[1] = (uint8_t)(value >> 8);
+}
+// Read buffer[0..1] as a little-endian 16-bit value; the result is in 0..65535.
+static inline int unpackInt16(const uint8_t *buffer)
+{
+	return (buffer[1] << 8) | buffer[0];
+}
+// Write `value` to buffer[0..3] in little-endian order (low byte first).
+static inline void packInt32(uint8_t *buffer, uint32_t value)
+{
+	int i;
+	for (i = 0; i < 4; ++i) { // byte i carries bits 8*i .. 8*i+7
+		buffer[i] = (uint8_t)(value >> (8 * i));
+	}
+}
+// Create a zero-initialized BGZF in read mode and allocate its two block buffers (allocations are not checked).
+static BGZF *bgzf_read_init()
+{
+	BGZF *reader = calloc(1, sizeof(BGZF));
+	reader->open_mode = 'r';
+	// each buffer has room for one block of up to BGZF_MAX_BLOCK_SIZE bytes
+	reader->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+	reader->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+#ifdef BGZF_CACHE
+	reader->cache = kh_init(cache);
+#endif
+	return reader;
+}
+// Create a BGZF in write mode; a negative level, or a stored level above 9, selects zlib's default (allocations are not checked).
+static BGZF *bgzf_write_init(int compress_level) // compress_level==-1 for the default level
+{
+	BGZF *writer = calloc(1, sizeof(BGZF));
+	writer->open_mode = 'w';
+	writer->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+	writer->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+	if (compress_level < 0) compress_level = Z_DEFAULT_COMPRESSION; // Z_DEFAULT_COMPRESSION==-1
+	writer->compress_level = compress_level; // the upper bound is checked on the stored field, as before
+	if (writer->compress_level > 9) writer->compress_level = Z_DEFAULT_COMPRESSION;
+	return writer;
+}
+// Derive the compression level from a mode string: the first digit if any, 0 if 'u' is present, otherwise -1.
+static int mode2level(const char *__restrict mode)
+{
+	const char *p = mode;
+	int compress_level = -1;
+	while (*p && (*p < '0' || *p > '9')) ++p; // stop at the first digit or at the terminator
+	if (*p) compress_level = *p - '0';
+	if (strchr(mode, 'u')) compress_level = 0; // 'u' overrides any digit
+	return compress_level;
+}
+// Open `path` as a BGZF stream: r/R in `mode` reads, otherwise w/W writes. Returns 0 if the open fails or the mode names neither.
+BGZF *bgzf_open(const char *path, const char *mode)
+{
+	BGZF *bgzf = 0;
+	if (strchr(mode, 'r') || strchr(mode, 'R')) {
+		_bgzf_file_t in = _bgzf_open(path, "r");
+		if (in == 0) return 0;
+		bgzf = bgzf_read_init();
+		bgzf->fp = in;
+	} else if (strchr(mode, 'w') || strchr(mode, 'W')) {
+		FILE *out = fopen(path, "w"); // the write side uses stdio directly
+		if (out == 0) return 0;
+		bgzf = bgzf_write_init(mode2level(mode)); // level comes from a digit or 'u' in the mode
+		bgzf->fp = out;
+	}
+	return bgzf;
+}
+// Wrap file descriptor `fd` as a BGZF stream: r/R in `mode` reads, otherwise w/W writes. Returns 0 on failure or if the mode names neither.
+BGZF *bgzf_dopen(int fd, const char *mode)
+{
+	BGZF *bgzf = 0;
+	if (strchr(mode, 'r') || strchr(mode, 'R')) {
+		_bgzf_file_t in = _bgzf_dopen(fd, "r");
+		if (in == 0) return 0;
+		bgzf = bgzf_read_init();
+		bgzf->fp = in;
+	} else if (strchr(mode, 'w') || strchr(mode, 'W')) {
+		FILE *out = fdopen(fd, "w"); // the write side uses stdio directly
+		if (out == 0) return 0;
+		bgzf = bgzf_write_init(mode2level(mode)); // level comes from a digit or 'u' in the mode
+		bgzf->fp = out;
+	}
+	return bgzf;
+}
+
+// Deflate the first block_length bytes of fp->uncompressed_block into fp->compressed_block as one BGZF block (header, data, footer). Returns the block's total size or -1 on a zlib error; bytes that did not fit are moved to the buffer front and counted in fp->block_offset.
+static int deflate_block(BGZF *fp, int block_length)
+{
+	uint8_t *buffer = fp->compressed_block;
+	int buffer_size = BGZF_BLOCK_SIZE;
+	int input_length = block_length; // bytes actually compressed; reduced below when the output does not fit
+	int compressed_length = 0;
+	int remaining;
+	uint32_t crc;
+
+	assert(block_length <= BGZF_BLOCK_SIZE); // guaranteed by the caller
+	memcpy(buffer, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block
+	while (1) { // loop to retry for blocks that do not compress enough
+		int status;
+		z_stream zs;
+		zs.zalloc = NULL;
+		zs.zfree = NULL;
+		zs.next_in = fp->uncompressed_block;
+		zs.avail_in = input_length;
+		zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH]; // compressed data goes right after the header
+		zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; // leave room for the footer
+		status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY); // -15 to disable zlib header/footer
+		if (status != Z_OK) {
+			fp->errcode |= BGZF_ERR_ZLIB;
+			return -1;
+		}
+		status = deflate(&zs, Z_FINISH);
+		if (status != Z_STREAM_END) { // not compressed enough
+			deflateEnd(&zs); // reset the stream
+			if (status == Z_OK) { // reduce the size and recompress
+				input_length -= 1024;
+				assert(input_length > 0); // logically, this should not happen
+				continue;
+			}
+			fp->errcode |= BGZF_ERR_ZLIB;
+			return -1;
+		}
+		if (deflateEnd(&zs) != Z_OK) {
+			fp->errcode |= BGZF_ERR_ZLIB;
+			return -1;
+		}
+		compressed_length = zs.total_out;
+		compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; // total on-disk size of the block
+		assert(compressed_length <= BGZF_BLOCK_SIZE);
+		break;
+	}
+
+	assert(compressed_length > 0);
+	packInt16((uint8_t*)&buffer[16], compressed_length - 1); // write the compressed_length; -1 to fit 2 bytes
+	crc = crc32(0L, NULL, 0L);
+	crc = crc32(crc, fp->uncompressed_block, input_length); // CRC covers only the bytes that were compressed
+	packInt32((uint8_t*)&buffer[compressed_length-8], crc); // footer, first half: CRC32
+	packInt32((uint8_t*)&buffer[compressed_length-4], input_length); // footer, second half: uncompressed length
+
+	remaining = block_length - input_length; // input left out of this block by the retry loop
+	if (remaining > 0) {
+		assert(remaining <= input_length); // source and destination of the memcpy below must not overlap
+		memcpy(fp->uncompressed_block, fp->uncompressed_block + input_length, remaining);
+	}
+	fp->block_offset = remaining; // the leftover becomes the start of the next block's input
+	return compressed_length;
+}
+
+// Inflate the block in fp->compressed_block into fp->uncompressed_block.
+// Returns the number of bytes produced, or -1 with BGZF_ERR_ZLIB set in fp->errcode on any zlib failure.
+static int inflate_block(BGZF* fp, int block_length)
+{
+	z_stream strm;
+	strm.zalloc = NULL;
+	strm.zfree = NULL;
+	strm.next_in = fp->compressed_block + 18; // the deflate data starts after the 18-byte block header
+	// NOTE(review): header and footer total 26 bytes but only 16 are subtracted here - confirm this is intended
+	strm.avail_in = block_length - 16;
+	strm.next_out = fp->uncompressed_block;
+	strm.avail_out = BGZF_BLOCK_SIZE;
+
+	if (inflateInit2(&strm, -15) != Z_OK) goto zlib_failed; // -15: no zlib header/footer
+	// the whole block must inflate in this single call; anything but Z_STREAM_END is an error
+	if (inflate(&strm, Z_FINISH) != Z_STREAM_END) {
+		inflateEnd(&strm); // release zlib state before reporting the error
+		goto zlib_failed;
+	}
+	if (inflateEnd(&strm) != Z_OK) goto zlib_failed;
+	return strm.total_out;
+
+zlib_failed:
+	fp->errcode |= BGZF_ERR_ZLIB;
+	return -1;
+}
+// Return 1 if `header` carries the fixed BGZF header values (bytes 31, 139, 8; flag bit 4; extra length 6; 'B','C'; subfield length 2), else 0.
+static int check_header(const uint8_t *header)
+{
+	if (header[0] != 31 || header[1] != 139 || header[2] != 8 || (header[3] & 4) == 0) return 0;
+	if (unpackInt16((uint8_t*)&header[10]) != 6) return 0; // length of the extra field
+	if (header[12] != 'B' || header[13] != 'C') return 0; // subfield identifier
+	return unpackInt16((uint8_t*)&header[14]) == 2; // subfield payload: the 2-byte block length
+}
+
+#ifdef BGZF_CACHE // real block cache; the #else branch below provides no-op stubs
+static void free_cache(BGZF *fp) // free every cached block copy and the hash table itself (read mode only)
+{
+	khint_t k;
+	khash_t(cache) *h = (khash_t(cache)*)fp->cache;
+	if (fp->open_mode != 'r') return; // bgzf_write_init() never creates a cache
+	for (k = kh_begin(h); k < kh_end(h); ++k)
+		if (kh_exist(h, k)) free(kh_val(h, k).block);
+	kh_destroy(cache, h);
+}
+
+static int load_block_from_cache(BGZF *fp, int64_t block_address) // returns the cached block size on a hit, 0 on a miss
+{
+	khint_t k;
+	cache_t *p;
+	khash_t(cache) *h = (khash_t(cache)*)fp->cache;
+	k = kh_get(cache, h, block_address);
+	if (k == kh_end(h)) return 0; // not cached
+	p = &kh_val(h, k);
+	if (fp->block_length != 0) fp->block_offset = 0; // same rule as bgzf_read_block(): keep the offset set by a preceding seek
+	fp->block_address = block_address;
+	fp->block_length = p->size;
+	memcpy(fp->uncompressed_block, p->block, BGZF_BLOCK_SIZE);
+	_bgzf_seek((_bgzf_file_t)fp->fp, p->end_offset, SEEK_SET); // position the file just past this block, as a real read would
+	return p->size;
+}
+
+static void cache_block(BGZF *fp, int size) // store a copy of the current uncompressed block; `size` is the byte count read from the file for it
+{
+	int ret;
+	khint_t k;
+	cache_t *p;
+	khash_t(cache) *h = (khash_t(cache)*)fp->cache;
+	if (BGZF_BLOCK_SIZE >= fp->cache_size) return; // cache_size too small for even one block: caching is off
+	if ((kh_size(h) + 1) * BGZF_BLOCK_SIZE > fp->cache_size) {
+		/* A better way would be to remove the oldest block in the
+		 * cache, but here we remove a random one for simplicity. This
+		 * should not have a big impact on performance. */
+		for (k = kh_begin(h); k < kh_end(h); ++k)
+			if (kh_exist(h, k)) break; // first occupied bucket is the one evicted
+		if (k < kh_end(h)) {
+			free(kh_val(h, k).block);
+			kh_del(cache, h, k);
+		}
+	}
+	k = kh_put(cache, h, fp->block_address, &ret);
+	if (ret == 0) return; // if this happens, a bug!
+	p = &kh_val(h, k);
+	p->size = fp->block_length;
+	p->end_offset = fp->block_address + size;
+	p->block = malloc(BGZF_BLOCK_SIZE); // NOTE(review): allocation result is not checked before the memcpy
+	memcpy(kh_val(h, k).block, fp->uncompressed_block, BGZF_BLOCK_SIZE);
+}
+#else
+static void free_cache(BGZF *fp) {} // no cache compiled in: nothing to free
+static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;} // always a miss
+static void cache_block(BGZF *fp, int size) {} // nothing is stored
+#endif
+
+int bgzf_read_block(BGZF *fp)
+{
+	uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block;
+	int count, size = 0, block_length, remaining;
+	int64_t block_address;
+	block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
+	if (load_block_from_cache(fp, block_address)) return 0;
+	count = _bgzf_read(fp->fp, header, sizeof(header));
+	if (count == 0) { // no data read
+		fp->block_length = 0;
+		return 0;
+	}
+	if (count != sizeof(header) || !check_header(header)) {
+		fp->errcode |= BGZF_ERR_HEADER;
+		return -1;
+	}
+	size = count;
+	block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1"
+	compressed_block = (uint8_t*)fp->compressed_block;
+	memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
+	remaining = block_length - BLOCK_HEADER_LENGTH;
+	count = _bgzf_read(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
+	if (count != remaining) {
+		fp->errcode |= BGZF_ERR_IO;
+		return -1;
+	}
+	size += count;
+	if ((count = inflate_block(fp, block_length)) < 0) return -1;
+	if (fp->block_length != 0) fp->block_offset = 0; // Do not reset offset if this read follows a seek.
+	fp->block_address = block_address;
+	fp->block_length = count;
+	cache_block(fp, size);
+	return 0;
+}
+// Copy up to `length` uncompressed bytes into `data`, loading blocks as needed. Returns the bytes copied (fewer at end of file), 0 if length <= 0, or -1 if a block cannot be read.
+ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length)
+{
+	uint8_t *dst = data;
+	ssize_t done = 0;
+	if (length <= 0) return 0;
+	assert(fp->open_mode == 'r');
+	while (done < length) {
+		const uint8_t *src;
+		int chunk, avail = fp->block_length - fp->block_offset;
+		if (avail <= 0) { // current block used up (or none loaded yet): fetch the next one
+			if (bgzf_read_block(fp) != 0) return -1;
+			avail = fp->block_length - fp->block_offset;
+			if (avail <= 0) break; // still nothing available: stop with what we have
+		}
+		chunk = length - done < avail? length - done : avail;
+		src = (uint8_t*)fp->uncompressed_block + fp->block_offset;
+		memcpy(dst, src, chunk);
+		dst += chunk;
+		done += chunk;
+		fp->block_offset += chunk;
+	}
+	if (fp->block_offset == fp->block_length) { // block fully consumed: record where the next one starts
+		fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
+		fp->block_offset = fp->block_length = 0;
+	}
+	return done;
+}
+// Compress and write out all data buffered in fp->uncompressed_block.
+// Returns 0 on success, -1 if compression fails or the write is short (BGZF_ERR_IO is set for the latter).
+int bgzf_flush(BGZF *fp)
+{
+	assert(fp->open_mode == 'w');
+	while (fp->block_offset > 0) { // deflate_block() resets block_offset to the bytes it could not fit
+		const int n_out = deflate_block(fp, fp->block_offset);
+		if (n_out < 0) return -1;
+		if (fwrite(fp->compressed_block, 1, n_out, fp->fp) != n_out) {
+			fp->errcode |= BGZF_ERR_IO; // possibly truncated file
+			return -1;
+		}
+		fp->block_address += n_out;
+	}
+	return 0;
+}
+// Flush only when `size` more bytes would not fit in the current block; returns bgzf_flush()'s result, or -1 when no flush was attempted.
+int bgzf_flush_try(BGZF *fp, ssize_t size)
+{
+	if (fp->block_offset + size <= BGZF_BLOCK_SIZE)
+		return -1; // the data still fits: nothing flushed
+	return bgzf_flush(fp);
+}
+// Append `length` bytes from `data` to the block buffer, flushing each time the buffer becomes full.
+// Returns the number of bytes taken from `data`; the loop stops early if a flush fails.
+ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length)
+{
+	const uint8_t *src = data;
+	int written = 0;
+	assert(fp->open_mode == 'w');
+	while (written < length) {
+		int room = BGZF_BLOCK_SIZE - fp->block_offset; // free space left in the block buffer
+		int chunk = room < length - written? room : length - written;
+		memcpy((uint8_t*)fp->uncompressed_block + fp->block_offset, src, chunk);
+		fp->block_offset += chunk;
+		src += chunk;
+		written += chunk;
+		if (fp->block_offset != BGZF_BLOCK_SIZE) continue; // buffer not full yet
+		if (bgzf_flush(fp) != 0) break;
+	}
+	return written;
+}
+// Close a BGZF stream and free it; in write mode, first flush pending data and append an empty block. Returns 0 on success; on -1 the handle has not been freed.
+int bgzf_close(BGZF* fp)
+{
+	int ret, block_length;
+	if (fp == 0) return -1;
+	if (fp->open_mode == 'w') {
+		if (bgzf_flush(fp) != 0) return -1;
+		block_length = deflate_block(fp, 0); // write an empty block
+		if (block_length < 0) return -1; // a negative length must never reach fwrite() as a byte count
+		if (fwrite(fp->compressed_block, 1, block_length, fp->fp) != (size_t)block_length || fflush(fp->fp) != 0) {
+			fp->errcode |= BGZF_ERR_IO; // the trailing empty block is missing or incomplete
+			return -1;
+		}
+	}
+	ret = fp->open_mode == 'w'? fclose(fp->fp) : _bgzf_close(fp->fp);
+	if (ret != 0) return -1;
+	free(fp->uncompressed_block);
+	free(fp->compressed_block);
+	free_cache(fp);
+	free(fp);
+	return 0;
+}
+// Store `cache_size` (the limit cache_block() compares against) on `fp`; a null `fp` is ignored.
+void bgzf_set_cache_size(BGZF *fp, int cache_size)
+{
+	if (fp != 0) fp->cache_size = cache_size;
+}
+// Return 1 if the last 28 bytes of the file are the empty BGZF block, else 0 (also when they cannot be read); the file position is restored afterwards.
+int bgzf_check_EOF(BGZF *fp)
+{
+	static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
+	uint8_t buf[28];
+	int n;
+	off_t offset = _bgzf_tell((_bgzf_file_t)fp->fp);
+	if (_bgzf_seek(fp->fp, -28, SEEK_END) < 0) return 0;
+	n = _bgzf_read(fp->fp, buf, 28);
+	_bgzf_seek(fp->fp, offset, SEEK_SET);
+	return (n == 28 && memcmp(magic, buf, 28) == 0)? 1 : 0; // a short read leaves buf partly uninitialized, so never compare it
+}
+// Move a read-mode stream to virtual offset `pos`; only SEEK_SET is accepted.
+// Returns 0 on success, -1 with BGZF_ERR_MISUSE (wrong mode or `where`) or BGZF_ERR_IO (file seek failed).
+int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
+{
+	const int64_t file_pos = pos >> 16; // upper bits: file address of the block
+	const int in_block = pos & 0xFFFF; // low 16 bits: offset inside the uncompressed block
+
+	if (where != SEEK_SET || fp->open_mode != 'r') {
+		fp->errcode |= BGZF_ERR_MISUSE;
+		return -1;
+	}
+	if (_bgzf_seek(fp->fp, file_pos, SEEK_SET) < 0) {
+		fp->errcode |= BGZF_ERR_IO;
+		return -1;
+	}
+	// the block itself is loaded lazily by the next read
+	fp->block_offset = in_block;
+	fp->block_address = file_pos;
+	fp->block_length = 0;  // indicates current block has not been loaded
+	return 0;
+}
+// Return 1 if file `fn` can be opened and its first 16 bytes equal the first 16 bytes of g_magic, else 0.
+int bgzf_is_bgzf(const char *fn)
+{
+	uint8_t head[16];
+	int got;
+	_bgzf_file_t in = _bgzf_open(fn, "r");
+	if (in == 0) return 0;
+	got = _bgzf_read(in, head, 16);
+	_bgzf_close(in);
+	if (got != 16) return 0; // file shorter than the fixed header prefix
+	return memcmp(g_magic, head, 16) == 0;
+}
+// Return the next uncompressed byte (0..255), -1 at end of file, or -2 if the next block cannot be read.
+int bgzf_getc(BGZF *fp)
+{
+	const unsigned char *block = (unsigned char*)fp->uncompressed_block;
+	int c;
+	if (fp->block_offset >= fp->block_length) { // nothing left in the buffer: load the next block
+		if (bgzf_read_block(fp) != 0) return -2; /* error */
+		if (fp->block_length == 0) return -1; /* end-of-file */
+	}
+	c = block[fp->block_offset++];
+	if (fp->block_offset == fp->block_length) { // block fully consumed: record where the next one starts
+		fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
+		fp->block_offset = fp->block_length = 0;
+	}
+	return c;
+}
+
+#ifndef kroundup32 // defined here only if not already provided; rounds x up in place to the next power of two (shifts cover 32 bits)
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+// Read bytes up to (not including) `delim` into `str`, NUL-terminated. Returns the line length, or -1 (end of file) / -2 (read error) when nothing was read.
+int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
+{
+	int l, state = 0; // state: 0 = keep reading, 1 = delimiter found, -1 = end of file, -2 = read error
+	unsigned char *buf = (unsigned char*)fp->uncompressed_block;
+	str->l = 0;
+	do {
+		if (fp->block_offset >= fp->block_length) { // buffer exhausted: load the next block
+			if (bgzf_read_block(fp) != 0) { state = -2; break; }
+			if (fp->block_length == 0) { state = -1; break; }
+		}
+		for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l); // scan to the delimiter or the block end
+		if (l < fp->block_length) state = 1;
+		l -= fp->block_offset; // l is now the number of bytes to append
+		if (str->l + l + 1 >= str->m) { // grow so the new bytes plus the terminator fit
+			str->m = str->l + l + 2;
+			kroundup32(str->m);
+			str->s = (char*)realloc(str->s, str->m); // NOTE(review): result unchecked; on failure the old buffer leaks and the memcpy below writes through NULL
+		}
+		memcpy(str->s + str->l, buf + fp->block_offset, l);
+		str->l += l;
+		fp->block_offset += l + 1; // +1 steps over the delimiter (or just past the block end)
+		if (fp->block_offset >= fp->block_length) { // block fully consumed: record where the next one starts
+			fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp);
+			fp->block_offset = 0;
+			fp->block_length = 0;
+		} 
+	} while (state == 0);
+	if (str->l == 0 && state < 0) return state; // nothing read: report end of file or error
+	str->s[str->l] = 0;
+	return str->l;
+}
diff --git a/web/server/h2o/libh2o/deps/klib/bgzf.h b/web/server/h2o/libh2o/deps/klib/bgzf.h
new file mode 100644
index 000000000..29fe0e5da
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/bgzf.h
@@ -0,0 +1,196 @@
+/* The MIT License
+
+   Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
+                 2011 Attractive Chaos <attractor@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+/* The BGZF library was originally written by Bob Handsaker from the Broad
+ * Institute. It was later improved by the SAMtools developers. */
+
+#ifndef __BGZF_H
+#define __BGZF_H
+
+#include <stdint.h>
+#include <stdio.h>
+#include <zlib.h>
+
+#define BGZF_BLOCK_SIZE     0x10000
+#define BGZF_MAX_BLOCK_SIZE 0x10000
+
+#define BGZF_ERR_ZLIB   1
+#define BGZF_ERR_HEADER 2
+#define BGZF_ERR_IO     4
+#define BGZF_ERR_MISUSE 8
+
+typedef struct {
+    int open_mode:8, compress_level:8, errcode:16; // errcode: presumably an OR of the BGZF_ERR_* flags above -- TODO confirm
+	int cache_size;
+    int block_length, block_offset; // bytes available in uncompressed_block / read position within it
+    int64_t block_address; // upper part of the virtual offset built by bgzf_tell()
+    void *uncompressed_block, *compressed_block;
+	void *cache; // a pointer to a hash table
+	void *fp; // actual file handler; FILE* on writing; FILE* or knetFile* on reading
+} BGZF;
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+	size_t l, m; // l: number of bytes stored in s (terminator excluded); m: bytes allocated for s
+	char *s; // buffer; bgzf_getline() grows it with realloc() and NUL-terminates it
+} kstring_t;
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	/******************
+	 * Basic routines *
+	 ******************/
+
+	/**
+	 * Open an existing file descriptor for reading or writing.
+	 *
+	 * @param fd    file descriptor
+	 * @param mode  mode matching /[rwu0-9]+/: 'r' for reading, 'w' for writing and a digit specifies
+	 *              the zlib compression level; if both 'r' and 'w' are present, 'w' is ignored.
+	 * @return      BGZF file handler; 0 on error
+	 */
+	BGZF* bgzf_dopen(int fd, const char *mode);
+
+	#define bgzf_fdopen(fd, mode) bgzf_dopen((fd), (mode)) // for backward compatibility
+
+	/**
+	 * Open the specified file for reading or writing.
+	 */
+	BGZF* bgzf_open(const char* path, const char *mode);
+
+	/**
+	 * Close the BGZF and free all associated resources.
+	 *
+	 * @param fp    BGZF file handler
+	 * @return      0 on success and -1 on error
+	 */
+	int bgzf_close(BGZF *fp);
+
+	/**
+	 * Read up to _length_ bytes from the file storing into _data_.
+	 *
+	 * @param fp     BGZF file handler
+	 * @param data   data array to read into
+	 * @param length size of data to read
+	 * @return       number of bytes actually read; 0 on end-of-file and -1 on error
+	 */
+	ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length);
+
+	/**
+	 * Write _length_ bytes from _data_ to the file.
+	 *
+	 * @param fp     BGZF file handler
+	 * @param data   data array to write
+	 * @param length size of data to write
+	 * @return       number of bytes actually written; -1 on error
+	 */
+	ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length);
+
+	/**
+	 * Write the data in the buffer to the file.
+	 */
+	int bgzf_flush(BGZF *fp);
+
+	/**
+	 * Return a virtual file pointer to the current location in the file.
+	 * No interpretation of the value should be made, other than a subsequent
+	 * call to bgzf_seek can be used to position the file at the same point.
+	 * Return value is non-negative on success. The argument is evaluated twice.
+	 */
+	#define bgzf_tell(fp) (((fp)->block_address << 16) | ((fp)->block_offset & 0xFFFF))
+
+	/**
+	 * Set the file to read from the location specified by _pos_.
+	 *
+	 * @param fp     BGZF file handler
+	 * @param pos    virtual file offset returned by bgzf_tell()
+	 * @param whence must be SEEK_SET
+	 * @return       0 on success and -1 on error
+	 */
+	int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence);
+
+	/**
+	 * Check if the BGZF end-of-file (EOF) marker is present
+	 *
+	 * @param fp    BGZF file handler opened for reading
+	 * @return      1 if EOF is present; 0 if not or on I/O error
+	 */
+	int bgzf_check_EOF(BGZF *fp);
+
+	/**
+	 * Check if a file is in the BGZF format
+	 *
+	 * @param fn    file name
+	 * @return      1 if _fn_ is BGZF; 0 if not or on I/O error
+	 */
+	 int bgzf_is_bgzf(const char *fn);
+
+	/*********************
+	 * Advanced routines *
+	 *********************/
+
+	/**
+	 * Set the cache size. Only effective when compiled with -DBGZF_CACHE.
+	 *
+	 * @param fp    BGZF file handler
+	 * @param size  size of cache in bytes; 0 to disable caching (default)
+	 */
+	void bgzf_set_cache_size(BGZF *fp, int size);
+
+	/**
+	 * Flush the file if the remaining buffer size is smaller than _size_ 
+	 */
+	int bgzf_flush_try(BGZF *fp, ssize_t size);
+
+	/**
+	 * Read one byte from a BGZF file. It is faster than bgzf_read()
+	 * @param fp     BGZF file handler
+	 * @return       byte read; -1 on end-of-file and -2 on error
+	 */
+	int bgzf_getc(BGZF *fp);
+
+	/**
+	 * Read one line from a BGZF file. It is faster than bgzf_getc()
+	 *
+	 * @param fp     BGZF file handler
+	 * @param delim  delimiter
+	 * @param str    string to write to; must be initialized
+	 * @return       length of the string (0 for an empty line); -1 on end-of-file and -2 on error if nothing was read
+	 */
+	int bgzf_getline(BGZF *fp, int delim, kstring_t *str);
+
+	/**
+	 * Read the next BGZF block.
+	 */
+	int bgzf_read_block(BGZF *fp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/web/server/h2o/libh2o/deps/klib/kbit.h b/web/server/h2o/libh2o/deps/klib/kbit.h
new file mode 100644
index 000000000..3793cf837
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/kbit.h
@@ -0,0 +1,30 @@
+#ifndef KBIT_H
+#define KBIT_H
+
+#include <stdint.h>
+
+static inline uint64_t kbi_popcount64(uint64_t y) // standard popcount; from wikipedia
+{
+	y -= ((y >> 1) & 0x5555555555555555ull); // each 2-bit field now holds the number of bits that were set in it
+	y = (y & 0x3333333333333333ull) + (y >> 2 & 0x3333333333333333ull); // add neighbouring fields into 4-bit counts
+	return ((y + (y >> 4)) & 0xf0f0f0f0f0f0f0full) * 0x101010101010101ull >> 56; // per-byte counts; the multiply sums them into the top byte
+}
+
+static inline uint64_t kbi_DNAcount64(uint64_t y, int c) // count #A/C/G/T from a 2-bit encoded integer; from BWA
+{
+	// reduce nucleotide counting to bits counting: set the low bit of every 2-bit field equal to c (only bits 0-1 of c are used)
+	y = ((c&2)? y : ~y) >> 1 & ((c&1)? y : ~y) & 0x5555555555555555ull;
+	// count the number of 1s in y; each 2-bit field holds at most one, so the count starts at the 4-bit step
+	y = (y & 0x3333333333333333ull) + (y >> 2 & 0x3333333333333333ull);
+	return ((y + (y >> 4)) & 0xf0f0f0f0f0f0f0full) * 0x101010101010101ull >> 56;
+}
+
+#ifndef kroundup32 // round a 32-bit integer, in place, up to the next power of 2; from "bit twiddling hacks"
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#ifndef kbi_swap
+#define kbi_swap(a, b) (((a) ^= (b)), ((b) ^= (a)), ((a) ^= (b))) // XOR swap from "bit twiddling hacks"; yields 0 if a and b are the same object
+#endif
+
+#endif
diff --git a/web/server/h2o/libh2o/deps/klib/kbtree.h b/web/server/h2o/libh2o/deps/klib/kbtree.h
new file mode 100644
index 000000000..5ed5330b9
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/kbtree.h
@@ -0,0 +1,384 @@
+/*-
+ * Copyright 1997-1999, 2001, John-Mark Gurney.
+ *           2008-2009, Attractive Chaos <attractor@live.co.uk>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef __AC_KBTREE_H
+#define __AC_KBTREE_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+typedef struct {
+	int32_t is_internal:1, n:31;
+} kbnode_t;
+
+#define	__KB_KEY(type, x)	((type*)((char*)(x) + 4)) /* key array: starts 4 bytes into the node */
+#define __KB_PTR(btr, x)	((kbnode_t**)((char*)(x) + (btr)->off_ptr)) /* child pointer array: starts off_ptr bytes into the node */
+
+#define __KB_TREE_T(name)						\
+	typedef struct {							\
+		kbnode_t *root;							\
+		int	off_key, off_ptr, ilen, elen;		\
+		int	n, t;								\
+		int	n_keys, n_nodes;					\
+	} kbtree_##name##_t;
+
+#define __KB_INIT(name, key_t) /* defines kb_init_<name>(size): new empty tree, or 0 if size is too small or memory runs out */ \
+	kbtree_##name##_t *kb_init_##name(int size)							\
+	{																	\
+		kbtree_##name##_t *b = (kbtree_##name##_t*)calloc(1, sizeof(kbtree_##name##_t)); \
+		if (b == 0) return 0; /* out of memory */						\
+		b->t = size < (int)(4 + sizeof(void*))? 0 : (int)(((size - 4 - sizeof(void*)) / (sizeof(void*) + sizeof(key_t)) + 1) >> 1); \
+		if (b->t < 2) { /* size cannot hold a node with at least 3 keys */ \
+			free(b); return 0;											\
+		}																\
+		b->n = 2 * b->t - 1; /* maximum number of keys per node */		\
+		b->off_ptr = 4 + b->n * sizeof(key_t);							\
+		b->ilen = (4 + sizeof(void*) + b->n * (sizeof(void*) + sizeof(key_t)) + 3) >> 2 << 2; \
+		b->elen = (b->off_ptr + 3) >> 2 << 2; /* leaves carry no child pointers */ \
+		b->root = (kbnode_t*)calloc(1, b->ilen);						\
+		if (b->root == 0) { free(b); return 0; } /* out of memory */	\
+		++b->n_nodes; return b;											\
+	}
+
+#define __kb_destroy(b) do { /* free every node of tree b, then b itself; b may be 0 */ \
+		int __ki, __kmax = 8; /* __k-prefixed locals cannot shadow the caller's b */ \
+		kbnode_t *__kx, **__ktop, **__kstack = 0;						\
+		if (b) {														\
+			__ktop = __kstack = (kbnode_t**)calloc(__kmax, sizeof(kbnode_t*)); \
+			*__ktop++ = (b)->root;										\
+			while (__ktop != __kstack) {								\
+				__kx = *--__ktop;										\
+				if (__kx->is_internal == 0) { free(__kx); continue; }	\
+				for (__ki = 0; __ki <= __kx->n; ++__ki)					\
+					if (__KB_PTR((b), __kx)[__ki]) {					\
+						if (__ktop - __kstack == __kmax) { /* stack full: double it */ \
+							__kmax <<= 1;								\
+							__kstack = (kbnode_t**)realloc(__kstack, __kmax * sizeof(kbnode_t*)); \
+							__ktop = __kstack + (__kmax>>1);			\
+						}												\
+						*__ktop++ = __KB_PTR((b), __kx)[__ki];			\
+					}													\
+				free(__kx);												\
+			}															\
+		}																\
+		free(b); free(__kstack);										\
+	} while (0)
+
+#define __kb_get_first(key_t, b, ret) do { /* ret = key 0 of the leftmost leaf */ \
+		kbnode_t *__x = (b)->root;			\
+		while (__x->is_internal) /* leaves are elen bytes and have no child slots to read */ \
+			__x = __KB_PTR((b), __x)[0];	\
+		(ret) = __KB_KEY(key_t, __x)[0];	\
+	} while (0)
+
+#define __KB_GET_AUX0(name, key_t, __cmp)								\
+	static inline int __kb_get_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \
+	{																	\
+		int tr, *rr, begin, end, n = x->n >> 1;							\
+		if (x->n == 0) return -1;										\
+		if (__cmp(*k, __KB_KEY(key_t, x)[n]) < 0) {						\
+			begin = 0; end = n;											\
+		} else { begin = n; end = x->n - 1; }							\
+		rr = r? r : &tr;												\
+		n = end;														\
+		while (n >= begin && (*rr = __cmp(*k, __KB_KEY(key_t, x)[n])) < 0) --n; \
+		return n;														\
+	}
+
+#define __KB_GET_AUX1(name, key_t, __cmp)								\
+	static inline int __kb_getp_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \
+	{																	\
+		int tr, *rr, begin = 0, end = x->n;								\
+		if (x->n == 0) return -1;										\
+		rr = r? r : &tr;												\
+		while (begin < end) {											\
+			int mid = (begin + end) >> 1;								\
+			if (__cmp(__KB_KEY(key_t, x)[mid], *k) < 0) begin = mid + 1; \
+			else end = mid;												\
+		}																\
+		if (begin == x->n) { *rr = 1; return x->n - 1; }				\
+		if ((*rr = __cmp(*k, __KB_KEY(key_t, x)[begin])) < 0) --begin;	\
+		return begin;													\
+	}
+
+#define __KB_GET(name, key_t)											\
+	static key_t *kb_getp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
+	{																	\
+		int i, r = 0;													\
+		kbnode_t *x = b->root;											\
+		while (x) {														\
+			i = __kb_getp_aux_##name(x, k, &r);							\
+			if (i >= 0 && r == 0) return &__KB_KEY(key_t, x)[i];		\
+			if (x->is_internal == 0) return 0;							\
+			x = __KB_PTR(b, x)[i + 1];									\
+		}																\
+		return 0;														\
+	}																	\
+	static inline key_t *kb_get_##name(kbtree_##name##_t *b, const key_t k) \
+	{																	\
+		return kb_getp_##name(b, &k);									\
+	}
+
+#define __KB_INTERVAL(name, key_t)										\
+	static void kb_intervalp_##name(kbtree_##name##_t *b, const key_t * __restrict k, key_t **lower, key_t **upper)	\
+	{																	\
+		int i, r = 0;													\
+		kbnode_t *x = b->root;											\
+		*lower = *upper = 0;											\
+		while (x) {														\
+			i = __kb_getp_aux_##name(x, k, &r);							\
+			if (i >= 0 && r == 0) {										\
+				*lower = *upper = &__KB_KEY(key_t, x)[i];				\
+				return;													\
+			}															\
+			if (i >= 0) *lower = &__KB_KEY(key_t, x)[i];				\
+			if (i < x->n - 1) *upper = &__KB_KEY(key_t, x)[i + 1];		\
+			if (x->is_internal == 0) return;							\
+			x = __KB_PTR(b, x)[i + 1];									\
+		}																\
+	}																	\
+	static inline void kb_interval_##name(kbtree_##name##_t *b, const key_t k, key_t **lower, key_t **upper) \
+	{																	\
+		kb_intervalp_##name(b, &k, lower, upper);						\
+	}
+
+#define __KB_PUT(name, key_t, __cmp)									\
+	/* x must be an internal node */									\
+	static void __kb_split_##name(kbtree_##name##_t *b, kbnode_t *x, int i, kbnode_t *y) \
+	{																	\
+		kbnode_t *z;													\
+		z = (kbnode_t*)calloc(1, y->is_internal? b->ilen : b->elen);	\
+		++b->n_nodes;													\
+		z->is_internal = y->is_internal;								\
+		z->n = b->t - 1;												\
+		memcpy(__KB_KEY(key_t, z), __KB_KEY(key_t, y) + b->t, sizeof(key_t) * (b->t - 1)); \
+		if (y->is_internal) memcpy(__KB_PTR(b, z), __KB_PTR(b, y) + b->t, sizeof(void*) * b->t); \
+		y->n = b->t - 1;												\
+		memmove(__KB_PTR(b, x) + i + 2, __KB_PTR(b, x) + i + 1, sizeof(void*) * (x->n - i)); \
+		__KB_PTR(b, x)[i + 1] = z;										\
+		memmove(__KB_KEY(key_t, x) + i + 1, __KB_KEY(key_t, x) + i, sizeof(key_t) * (x->n - i)); \
+		__KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[b->t - 1];			\
+		++x->n;															\
+	}																	\
+	static void __kb_putp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k) \
+	{																	\
+		int i = x->n - 1;												\
+		if (x->is_internal == 0) {										\
+			i = __kb_getp_aux_##name(x, k, 0);							\
+			if (i != x->n - 1)											\
+				memmove(__KB_KEY(key_t, x) + i + 2, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
+			__KB_KEY(key_t, x)[i + 1] = *k;								\
+			++x->n;														\
+		} else {														\
+			i = __kb_getp_aux_##name(x, k, 0) + 1;						\
+			if (__KB_PTR(b, x)[i]->n == 2 * b->t - 1) {					\
+				__kb_split_##name(b, x, i, __KB_PTR(b, x)[i]);			\
+				if (__cmp(*k, __KB_KEY(key_t, x)[i]) > 0) ++i;			\
+			}															\
+			__kb_putp_aux_##name(b, __KB_PTR(b, x)[i], k);				\
+		}																\
+	}																	\
+	static void kb_putp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
+	{																	\
+		kbnode_t *r, *s;												\
+		++b->n_keys;													\
+		r = b->root;													\
+		if (r->n == 2 * b->t - 1) {										\
+			++b->n_nodes;												\
+			s = (kbnode_t*)calloc(1, b->ilen);							\
+			b->root = s; s->is_internal = 1; s->n = 0;					\
+			__KB_PTR(b, s)[0] = r;										\
+			__kb_split_##name(b, s, 0, r);								\
+			r = s;														\
+		}																\
+		__kb_putp_aux_##name(b, r, k);									\
+	}																	\
+	static inline void kb_put_##name(kbtree_##name##_t *b, const key_t k) \
+	{																	\
+		kb_putp_##name(b, &k);											\
+	}
+
+
+#define __KB_DEL(name, key_t) /* defines kb_delp_<name>/kb_del_<name>; the key is assumed to be in the tree */ \
+	static key_t __kb_delp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k, int s) \
+	{																	\
+		int yn, zn, i, r = 0;											\
+		kbnode_t *xp, *y, *z;											\
+		key_t kp;														\
+		if (x == 0) return *k;											\
+		if (s) { /* s: 0 = delete *k, 1 = delete the last key under x, 2 = delete the first key under x */ \
+			r = x->is_internal == 0? 0 : s == 1? 1 : -1;				\
+			i = s == 1? x->n - 1 : -1;									\
+		} else i = __kb_getp_aux_##name(x, k, &r);						\
+		if (x->is_internal == 0) { /* leaf: r is not checked here, key i is removed unconditionally */ \
+			if (s == 2) ++i;											\
+			kp = __KB_KEY(key_t, x)[i];									\
+			memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
+			--x->n;														\
+			return kp;													\
+		}																\
+		if (r == 0) { /* key found in this internal node */				\
+			if ((yn = __KB_PTR(b, x)[i]->n) >= b->t) { /* replace it with the last key of the left child */ \
+				xp = __KB_PTR(b, x)[i];									\
+				kp = __KB_KEY(key_t, x)[i];								\
+				__KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 1); \
+				return kp;												\
+			} else if ((zn = __KB_PTR(b, x)[i + 1]->n) >= b->t) { /* or with the first key of the right child */ \
+				xp = __KB_PTR(b, x)[i + 1];								\
+				kp = __KB_KEY(key_t, x)[i];								\
+				__KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 2); \
+				return kp;												\
+			} else if (yn == b->t - 1 && zn == b->t - 1) { /* both minimal: merge key and right child into the left child */ \
+				y = __KB_PTR(b, x)[i]; z = __KB_PTR(b, x)[i + 1];		\
+				__KB_KEY(key_t, y)[y->n++] = *k;						\
+				memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, z), z->n * sizeof(key_t)); \
+				if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, z), (z->n + 1) * sizeof(void*)); \
+				y->n += z->n;											\
+				memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
+				memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \
+				--x->n;													\
+				free(z); --b->n_nodes; /* keep the node count in step with the free */ \
+				return __kb_delp_aux_##name(b, y, k, s);				\
+			}															\
+		}																\
+		++i;															\
+		if ((xp = __KB_PTR(b, x)[i])->n == b->t - 1) { /* child to descend into is minimal: refill it first */ \
+			if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n >= b->t) { /* borrow from the left sibling */ \
+				memmove(__KB_KEY(key_t, xp) + 1, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \
+				if (xp->is_internal) memmove(__KB_PTR(b, xp) + 1, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \
+				__KB_KEY(key_t, xp)[0] = __KB_KEY(key_t, x)[i - 1];		\
+				__KB_KEY(key_t, x)[i - 1] = __KB_KEY(key_t, y)[y->n - 1]; \
+				if (xp->is_internal) __KB_PTR(b, xp)[0] = __KB_PTR(b, y)[y->n]; \
+				--y->n; ++xp->n;										\
+			} else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n >= b->t) { /* borrow from the right sibling */ \
+				__KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i];	\
+				__KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[0];			\
+				if (xp->is_internal) __KB_PTR(b, xp)[xp->n] = __KB_PTR(b, y)[0]; \
+				--y->n;													\
+				memmove(__KB_KEY(key_t, y), __KB_KEY(key_t, y) + 1, y->n * sizeof(key_t)); \
+				if (y->is_internal) memmove(__KB_PTR(b, y), __KB_PTR(b, y) + 1, (y->n + 1) * sizeof(void*)); \
+			} else if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n == b->t - 1) { /* merge into the left sibling */ \
+				__KB_KEY(key_t, y)[y->n++] = __KB_KEY(key_t, x)[i - 1];	\
+				memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, xp), xp->n * sizeof(key_t));	\
+				if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \
+				y->n += xp->n;											\
+				memmove(__KB_KEY(key_t, x) + i - 1, __KB_KEY(key_t, x) + i, (x->n - i) * sizeof(key_t)); \
+				memmove(__KB_PTR(b, x) + i, __KB_PTR(b, x) + i + 1, (x->n - i) * sizeof(void*)); \
+				--x->n;													\
+				free(xp); --b->n_nodes;									\
+				xp = y;													\
+			} else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n == b->t - 1) { /* merge the right sibling in */ \
+				__KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i];	\
+				memmove(__KB_KEY(key_t, xp) + xp->n, __KB_KEY(key_t, y), y->n * sizeof(key_t));	\
+				if (xp->is_internal) memmove(__KB_PTR(b, xp) + xp->n, __KB_PTR(b, y), (y->n + 1) * sizeof(void*)); \
+				xp->n += y->n;											\
+				memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
+				memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \
+				--x->n;													\
+				free(y); --b->n_nodes;									\
+			}															\
+		}																\
+		return __kb_delp_aux_##name(b, xp, k, s);						\
+	}																	\
+	static key_t kb_delp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
+	{																	\
+		kbnode_t *x;													\
+		key_t ret;														\
+		ret = __kb_delp_aux_##name(b, b->root, k, 0);					\
+		--b->n_keys;													\
+		if (b->root->n == 0 && b->root->is_internal) { /* root emptied by a merge: its only child becomes the root */ \
+			--b->n_nodes;												\
+			x = b->root;												\
+			b->root = __KB_PTR(b, x)[0];								\
+			free(x);													\
+		}																\
+		return ret;														\
+	}																	\
+	static inline key_t kb_del_##name(kbtree_##name##_t *b, const key_t k) \
+	{																	\
+		return kb_delp_##name(b, &k);									\
+	}
+
+typedef struct {
+	kbnode_t *x;
+	int i;
+} __kbstack_t;
+
+#define __kb_traverse(key_t, b, __func) do {							\
+		int __kmax = 8;													\
+		__kbstack_t *__kstack, *__kp;									\
+		__kp = __kstack = (__kbstack_t*)calloc(__kmax, sizeof(__kbstack_t)); \
+		__kp->x = (b)->root; __kp->i = 0;								\
+		for (;;) {														\
+			while (__kp->x && __kp->i <= __kp->x->n) {					\
+				if (__kp - __kstack == __kmax - 1) {					\
+					__kmax <<= 1;										\
+					__kstack = (__kbstack_t*)realloc(__kstack, __kmax * sizeof(__kbstack_t)); \
+					__kp = __kstack + (__kmax>>1) - 1;					\
+				}														\
+				(__kp+1)->i = 0; (__kp+1)->x = __kp->x->is_internal? __KB_PTR(b, __kp->x)[__kp->i] : 0; \
+				++__kp;													\
+			}															\
+			--__kp;														\
+			if (__kp >= __kstack) {										\
+				if (__kp->x && __kp->i < __kp->x->n) __func(&__KB_KEY(key_t, __kp->x)[__kp->i]); \
+				++__kp->i;												\
+			} else break;												\
+		}																\
+		free(__kstack);													\
+	} while (0)
+
+#define KBTREE_INIT(name, key_t, __cmp)			\
+	__KB_TREE_T(name)							\
+	__KB_INIT(name, key_t)						\
+	__KB_GET_AUX1(name, key_t, __cmp)			\
+	__KB_GET(name, key_t)						\
+	__KB_INTERVAL(name, key_t)					\
+	__KB_PUT(name, key_t, __cmp)				\
+	__KB_DEL(name, key_t)
+
+#define KB_DEFAULT_SIZE 512
+
+#define kbtree_t(name) kbtree_##name##_t
+#define kb_init(name, s) kb_init_##name(s)
+#define kb_destroy(name, b) __kb_destroy(b)
+#define kb_get(name, b, k) kb_get_##name(b, k)
+#define kb_put(name, b, k) kb_put_##name(b, k)
+#define kb_del(name, b, k) kb_del_##name(b, k)
+#define kb_interval(name, b, k, l, u) kb_interval_##name(b, k, l, u)
+#define kb_getp(name, b, k) kb_getp_##name(b, k)
+#define kb_putp(name, b, k) kb_putp_##name(b, k)
+#define kb_delp(name, b, k) kb_delp_##name(b, k)
+#define kb_intervalp(name, b, k, l, u) kb_intervalp_##name(b, k, l, u)
+
+#define kb_size(b) ((b)->n_keys)
+
+#define kb_generic_cmp(a, b) (((b) < (a)) - ((a) < (b)))
+#define kb_str_cmp(a, b) strcmp(a, b)
+
+#endif
diff --git a/web/server/h2o/libh2o/deps/klib/kgraph.h b/web/server/h2o/libh2o/deps/klib/kgraph.h
new file mode 100644
index 000000000..af008ef7e
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/kgraph.h
@@ -0,0 +1,79 @@
+#ifndef AC_KGRAPH_H
+#define AC_KGRAPH_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#include "khash.h"
+#include "kbtree.h"
+
+typedef unsigned kgint_t;
+
+#define kgraph_t(name) kh_##name##_t
+
+#define __KG_BASIC(name, SCOPE, vertex_t, arc_t, ehn) \
+	SCOPE kgraph_t(name) *kg_init_##name(void) { return kh_init(name); } \
+	SCOPE void kg_destroy_##name(kgraph_t(name) *g) { \
+		khint_t k; \
+		if (g == 0) return; \
+		for (k = kh_begin(g); k != kh_end(g); ++k) \
+			if (kh_exist(g, k)) kh_destroy(ehn, kh_val(g, k)._arc); \
+		kh_destroy(name, g); \
+	} \
+	SCOPE vertex_t *kg_get_v_##name(kgraph_t(name) *g, kgint_t v) { \
+		khint_t k = kh_get(name, g, v); \
+		return k == kh_end(g)? 0 : &kh_val(g, k); \
+	} \
+	SCOPE vertex_t *kg_put_v_##name(kgraph_t(name) *g, kgint_t v, int *absent) { \
+		khint_t k; \
+		k = kh_put(name, g, v, absent); \
+		if (*absent) kh_val(g, k)._arc = kh_init(ehn); \
+		return &kh_val(g, k); \
+	} \
+	SCOPE void kg_put_a_##name(kgraph_t(name) *g, kgint_t vbeg, kgint_t vend, int dir, arc_t **pb, arc_t **pe) { \
+		vertex_t *p; \
+		khint_t k; \
+		int absent; \
+		p = kg_put_v_##name(g, vbeg, &absent); \
+		k = kh_put(ehn, p->_arc, vend<<2|dir, &absent); \
+		*pb = &kh_val(p->_arc, k); \
+		p = kg_put_v_##name(g, vend, &absent); \
+		k = kh_put(ehn, p->_arc, vbeg<<2|(~dir&3), &absent); \
+		*pe = &kh_val(p->_arc, k); \
+	} \
+	SCOPE vertex_t *kg_del_v_##name(kgraph_t(name) *g, kgint_t v) { \
+		khint_t k, k0, k2, k3; \
+		khash_t(ehn) *h; \
+		k0 = k = kh_get(name, g, v); \
+		if (k == kh_end(g)) return 0; /* not present in the graph */ \
+		h = kh_val(g, k)._arc; \
+		for (k = kh_begin(h); k != kh_end(h); ++k) /* remove v from its neighbors */ \
+			if (kh_exist(h, k)) { \
+				k2 = kh_get(name, g, kh_key(h, k)>>2); \
+				/* assert(k2 != kh_end(g)); */ \
+				k3 = kh_get(ehn, kh_val(g, k2)._arc, v<<2|(~kh_key(h, k)&3)); \
+				/* assert(k3 != kh_end(kh_val(g, k2)._arc)); */ \
+				kh_del(ehn, kh_val(g, k2)._arc, k3); \
+			} \
+		kh_destroy(ehn, h); \
+		kh_del(name, g, k0); \
+		return &kh_val(g, k0); \
+	}
+
+#define KGRAPH_PRINT(name, SCOPE) \
+	SCOPE void kg_print_##name(kgraph_t(name) *g) { \
+		khint_t k, k2; \
+		for (k = kh_begin(g); k != kh_end(g); ++k) \
+			if (kh_exist(g, k)) { \
+				printf("v %u\n", kh_key(g, k)); \
+				for (k2 = kh_begin(kh_val(g, k)._arc); k2 != kh_end(kh_val(g, k)._arc); ++k2) \
+					if (kh_exist(kh_val(g, k)._arc, k2) && kh_key(g, k) < kh_key(kh_val(g, k)._arc, k2)>>2) \
+						printf("a %u%c%c%u\n", kh_key(g, k), "><"[kh_key(kh_val(g, k)._arc, k2)>>1&1], \
+								"><"[kh_key(kh_val(g, k)._arc, k2)&1], kh_key(kh_val(g, k)._arc, k2)>>2); \
+			} \
+	}
+
+#define KGRAPH_INIT(name, SCOPE, vertex_t, arc_t, ehn) \
+	KHASH_INIT2(name, SCOPE, kgint_t, vertex_t, 1, kh_int_hash_func, kh_int_hash_equal) \
+	__KG_BASIC(name, SCOPE, vertex_t, arc_t, ehn)
+
+#endif
diff --git a/web/server/h2o/libh2o/deps/klib/khash.h b/web/server/h2o/libh2o/deps/klib/khash.h
new file mode 100644
index 000000000..5e55088b2
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/khash.h
@@ -0,0 +1,619 @@
+/* The MIT License
+
+   Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/*
+  An example:
+
+#include "khash.h"
+KHASH_MAP_INIT_INT(32, char)
+int main() {
+	int ret, is_missing;
+	khiter_t k;
+	khash_t(32) *h = kh_init(32);
+	k = kh_put(32, h, 5, &ret);
+	kh_value(h, k) = 10;
+	k = kh_get(32, h, 10);
+	is_missing = (k == kh_end(h));
+	k = kh_get(32, h, 5);
+	kh_del(32, h, k);
+	for (k = kh_begin(h); k != kh_end(h); ++k)
+		if (kh_exist(h, k)) kh_value(h, k) = 1;
+	kh_destroy(32, h);
+	return 0;
+}
+*/
+
+/*
+  2013-05-02 (0.2.8):
+
+	* Use quadratic probing. When the capacity is power of 2, stepping function
+	  i*(i+1)/2 guarantees to traverse each bucket. It is better than double
+	  hashing on cache performance and is more robust than linear probing.
+
+	  In theory, double hashing should be more robust than quadratic probing.
+	  However, my implementation is probably not for large hash tables, because
+	  the second hash function is closely tied to the first hash function,
+	  which reduce the effectiveness of double hashing.
+
+	Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php
+
+  2011-12-29 (0.2.7):
+
+    * Minor code clean up; no actual effect.
+
+  2011-09-16 (0.2.6):
+
+	* The capacity is a power of 2. This seems to dramatically improve the
+	  speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
+
+	   - http://code.google.com/p/ulib/
+	   - http://nothings.org/computer/judy/
+
+	* Allow to optionally use linear probing which usually has better
+	  performance for random input. Double hashing is still the default as it
+	  is more robust to certain non-random input.
+
+	* Added Wang's integer hash function (not used by default). This hash
+	  function is more robust to certain non-random input.
+
+  2011-02-14 (0.2.5):
+
+    * Allow to declare global functions.
+
+  2009-09-26 (0.2.4):
+
+    * Improve portability
+
+  2008-09-19 (0.2.3):
+
+	* Corrected the example
+	* Improved interfaces
+
+  2008-09-11 (0.2.2):
+
+	* Improved speed a little in kh_put()
+
+  2008-09-10 (0.2.1):
+
+	* Added kh_clear()
+	* Fixed a compiling error
+
+  2008-09-02 (0.2.0):
+
+	* Changed to token concatenation which increases flexibility.
+
+  2008-08-31 (0.1.2):
+
+	* Fixed a bug in kh_get(), which has not been tested previously.
+
+  2008-08-31 (0.1.1):
+
+	* Added destructor
+*/
+
+
+#ifndef __AC_KHASH_H
+#define __AC_KHASH_H
+
+/*!
+  @header
+
+  Generic hash table library.
+ */
+
+#define AC_VERSION_KHASH_H "0.2.8"
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+/* compiler specific configuration */
+
+#if UINT_MAX == 0xffffffffu
+typedef unsigned int khint32_t;
+#elif ULONG_MAX == 0xffffffffu
+typedef unsigned long khint32_t;
+#endif
+
+#if ULONG_MAX == ULLONG_MAX
+typedef unsigned long khint64_t;
+#else
+typedef unsigned long long khint64_t;
+#endif
+
+#ifndef kh_inline
+#ifdef _MSC_VER
+#define kh_inline __inline
+#else
+#define kh_inline inline
+#endif
+#endif /* kh_inline */
+
+typedef khint32_t khint_t;
+typedef khint_t khiter_t;
+
+#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
+#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
+#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
+#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
+#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
+#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
+#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
+
+#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#ifndef kcalloc
+#define kcalloc(N,Z) calloc(N,Z)
+#endif
+#ifndef kmalloc
+#define kmalloc(Z) malloc(Z)
+#endif
+#ifndef krealloc
+#define krealloc(P,Z) realloc(P,Z)
+#endif
+#ifndef kfree
+#define kfree(P) free(P)
+#endif
+
+static const double __ac_HASH_UPPER = 0.77;
+
+#define __KHASH_TYPE(name, khkey_t, khval_t) /* declares the table struct kh_<name>_t */ \
+	typedef struct kh_##name##_s { \
+		khint_t n_buckets, size, n_occupied, upper_bound; /* capacity; live keys; live + deleted buckets; n_occupied level at which kh_put resizes */ \
+		khint32_t *flags; /* 2 bits per bucket, 16 buckets per word: bit 1 = empty, bit 0 = deleted */ \
+		khkey_t *keys; \
+		khval_t *vals; /* only (re)allocated by kh_resize when kh_is_map is non-zero */ \
+	} kh_##name##_t;
+
+#define __KHASH_PROTOTYPES(name, khkey_t, khval_t)	 					\
+	extern kh_##name##_t *kh_init_##name(void);							\
+	extern void kh_destroy_##name(kh_##name##_t *h);					\
+	extern void kh_clear_##name(kh_##name##_t *h);						\
+	extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); 	\
+	extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
+	extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
+	extern void kh_del_##name(kh_##name##_t *h, khint_t x);
+
+#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+	SCOPE kh_##name##_t *kh_init_##name(void) {							\
+		return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t));		\
+	}																	\
+	SCOPE void kh_destroy_##name(kh_##name##_t *h)						\
+	{																	\
+		if (h) {														\
+			kfree((void *)h->keys); kfree(h->flags);					\
+			kfree((void *)h->vals);										\
+			kfree(h);													\
+		}																\
+	}																	\
+	SCOPE void kh_clear_##name(kh_##name##_t *h)						\
+	{																	\
+		if (h && h->flags) {											\
+			memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
+			h->size = h->n_occupied = 0;								\
+		}																\
+	}																	\
+	SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) 	\
+	{																	\
+		if (h->n_buckets) {												\
+			khint_t k, i, last, mask, step = 0; \
+			mask = h->n_buckets - 1;									\
+			k = __hash_func(key); i = k & mask;							\
+			last = i; \
+			while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+				i = (i + (++step)) & mask; \
+				if (i == last) return h->n_buckets;						\
+			}															\
+			return __ac_iseither(h->flags, i)? h->n_buckets : i;		\
+		} else return 0;												\
+	}																	\
+	SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
+	{ /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
+		khint32_t *new_flags = 0;										\
+		khint_t j = 1;													\
+		{																\
+			kroundup32(new_n_buckets); 									\
+			if (new_n_buckets < 4) new_n_buckets = 4;					\
+			if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0;	/* requested size is too small */ \
+			else { /* hash table size to be changed (shrink or expand); rehash */ \
+				new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t));	\
+				if (!new_flags) return -1;								\
+				memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
+				if (h->n_buckets < new_n_buckets) {	/* expand */		\
+					khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+					if (!new_keys) { kfree(new_flags); return -1; }		\
+					h->keys = new_keys;									\
+					if (kh_is_map) {									\
+						khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+						if (!new_vals) { kfree(new_flags); return -1; }	\
+						h->vals = new_vals;								\
+					}													\
+				} /* otherwise shrink */								\
+			}															\
+		}																\
+		if (j) { /* rehashing is needed */								\
+			for (j = 0; j != h->n_buckets; ++j) {						\
+				if (__ac_iseither(h->flags, j) == 0) {					\
+					khkey_t key = h->keys[j];							\
+					khval_t val;										\
+					khint_t new_mask;									\
+					new_mask = new_n_buckets - 1; 						\
+					if (kh_is_map) val = h->vals[j];					\
+					__ac_set_isdel_true(h->flags, j);					\
+					while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
+						khint_t k, i, step = 0; \
+						k = __hash_func(key);							\
+						i = k & new_mask;								\
+						while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \
+						__ac_set_isempty_false(new_flags, i);			\
+						if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
+							{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
+							if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
+							__ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
+						} else { /* write the element and jump out of the loop */ \
+							h->keys[i] = key;							\
+							if (kh_is_map) h->vals[i] = val;			\
+							break;										\
+						}												\
+					}													\
+				}														\
+			}															\
+			if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
+				h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+				if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+			}															\
+			kfree(h->flags); /* free the working space */				\
+			h->flags = new_flags;										\
+			h->n_buckets = new_n_buckets;								\
+			h->n_occupied = h->size;									\
+			h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
+		}																\
+		return 0;														\
+	}																	\
+	SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
+	{																	\
+		khint_t x;														\
+		if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
+			if (h->n_buckets > (h->size<<1)) {							\
+				if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
+					*ret = -1; return h->n_buckets;						\
+				}														\
+			} else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
+				*ret = -1; return h->n_buckets;							\
+			}															\
+		} /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
+		{																\
+			khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \
+			x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
+			if (__ac_isempty(h->flags, i)) x = i; /* for speed up */	\
+			else {														\
+				last = i; \
+				while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+					if (__ac_isdel(h->flags, i)) site = i;				\
+					i = (i + (++step)) & mask; \
+					if (i == last) { x = site; break; }					\
+				}														\
+				if (x == h->n_buckets) {								\
+					if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
+					else x = i;											\
+				}														\
+			}															\
+		}																\
+		if (__ac_isempty(h->flags, x)) { /* not present at all */		\
+			h->keys[x] = key;											\
+			__ac_set_isboth_false(h->flags, x);							\
+			++h->size; ++h->n_occupied;									\
+			*ret = 1;													\
+		} else if (__ac_isdel(h->flags, x)) { /* deleted */				\
+			h->keys[x] = key;											\
+			__ac_set_isboth_false(h->flags, x);							\
+			++h->size;													\
+			*ret = 2;													\
+		} else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
+		return x;														\
+	}																	\
+	SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x)				\
+	{																	\
+		if (x != h->n_buckets && !__ac_iseither(h->flags, x)) {			\
+			__ac_set_isdel_true(h->flags, x);							\
+			--h->size;													\
+		}																\
+	}
+
+#define KHASH_DECLARE(name, khkey_t, khval_t)		 					\
+	__KHASH_TYPE(name, khkey_t, khval_t) 								\
+	__KHASH_PROTOTYPES(name, khkey_t, khval_t)
+
+#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+	__KHASH_TYPE(name, khkey_t, khval_t) 								\
+	__KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+	KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+/* --- BEGIN OF HASH FUNCTIONS --- */
+
+/*! @function
+  @abstract     Integer hash function
+  @param  key   The integer [khint32_t]
+  @return       The hash value [khint_t]
+ */
+#define kh_int_hash_func(key) (khint32_t)(key)
+/*! @function
+  @abstract     Integer comparison function
+ */
+#define kh_int_hash_equal(a, b) ((a) == (b))
+/*! @function
+  @abstract     64-bit integer hash function
+  @param  key   The integer [khint64_t]
+  @return       The hash value [khint_t]
+ */
+#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
+/*! @function
+  @abstract     64-bit integer comparison function
+ */
+#define kh_int64_hash_equal(a, b) ((a) == (b))
+/*! @function
+  @abstract     const char* hash function
+  @param  s     Pointer to a null terminated string
+  @return       The hash value
+ */
+static kh_inline khint_t __ac_X31_hash_string(const char *s)
+{
+	khint_t h = (khint_t)*s;
+	if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s;
+	return h;
+}
+/*! @function
+  @abstract     Another interface to const char* hash function
+  @param  key   Pointer to a null terminated string [const char*]
+  @return       The hash value [khint_t]
+ */
+#define kh_str_hash_func(key) __ac_X31_hash_string(key)
+/*! @function
+  @abstract     Const char* comparison function
+ */
+#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
+
+static kh_inline khint_t __ac_Wang_hash(khint_t key)
+{
+    key += ~(key << 15);
+    key ^=  (key >> 10);
+    key +=  (key << 3);
+    key ^=  (key >> 6);
+    key += ~(key << 11);
+    key ^=  (key >> 16);
+    return key;
+}
+#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)(key)) /* Wang's integer hash; optional replacement for kh_int_hash_func */
+
+/* --- END OF HASH FUNCTIONS --- */
+
+/* Other convenient macros... */
+
+/*!
+  @abstract Type of the hash table.
+  @param  name  Name of the hash table [symbol]
+ */
+#define khash_t(name) kh_##name##_t
+
+/*! @function
+  @abstract     Initiate a hash table.
+  @param  name  Name of the hash table [symbol]
+  @return       Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_init(name) kh_init_##name()
+
+/*! @function
+  @abstract     Destroy a hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_destroy(name, h) kh_destroy_##name(h)
+
+/*! @function
+  @abstract     Reset a hash table without deallocating memory.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_clear(name, h) kh_clear_##name(h)
+
+/*! @function
+  @abstract     Resize a hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  s     New size [khint_t]
+ */
+#define kh_resize(name, h, s) kh_resize_##name(h, s)
+
+/*! @function
+  @abstract     Insert a key to the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Key [type of keys]
+  @param  r     Extra return code: -1 if the operation failed;
+                0 if the key is present in the hash table;
+                1 if the bucket is empty (never used); 2 if the element in
+				the bucket has been deleted [int*]
+  @return       Iterator to the inserted element [khint_t]
+ */
+#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
+
+/*! @function
+  @abstract     Retrieve a key from the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Key [type of keys]
+  @return       Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
+ */
+#define kh_get(name, h, k) kh_get_##name(h, k)
+
+/*! @function
+  @abstract     Remove a key from the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Iterator to the element to be deleted [khint_t]
+ */
+#define kh_del(name, h, k) kh_del_##name(h, k)
+
+/*! @function
+  @abstract     Test whether a bucket contains data.
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       1 if containing data; 0 otherwise [int]
+ */
+#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
+
+/*! @function
+  @abstract     Get key given an iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Key [type of keys]
+ */
+#define kh_key(h, x) ((h)->keys[x])
+
+/*! @function
+  @abstract     Get value given an iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Value [type of values]
+  @discussion   For hash sets, calling this results in segfault.
+ */
+#define kh_val(h, x) ((h)->vals[x])
+
+/*! @function
+  @abstract     Alias of kh_val()
+ */
+#define kh_value(h, x) ((h)->vals[x])
+
+/*! @function
+  @abstract     Get the start iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       The start iterator [khint_t]
+ */
+#define kh_begin(h) (khint_t)(0)
+
+/*! @function
+  @abstract     Get the end iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       The end iterator [khint_t]
+ */
+#define kh_end(h) ((h)->n_buckets)
+
+/*! @function
+  @abstract     Get the number of elements in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       Number of elements in the hash table [khint_t]
+ */
+#define kh_size(h) ((h)->size)
+
+/*! @function
+  @abstract     Get the number of buckets in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       Number of buckets in the hash table [khint_t]
+ */
+#define kh_n_buckets(h) ((h)->n_buckets)
+
+/*! @function
+  @abstract     Iterate over the entries in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  kvar  Variable to which key will be assigned
+  @param  vvar  Variable to which value will be assigned
+  @param  code  Block of code to execute
+ */
+#define kh_foreach(h, kvar, vvar, code) { khint_t __i;		\
+	for (__i = kh_begin(h); __i != kh_end(h); ++__i) {		\
+		if (!kh_exist(h,__i)) continue;						\
+		(kvar) = kh_key(h,__i);								\
+		(vvar) = kh_val(h,__i);								\
+		code;												\
+	} }
+
+/*! @function
+  @abstract     Iterate over the values in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  vvar  Variable to which value will be assigned
+  @param  code  Block of code to execute
+ */
+#define kh_foreach_value(h, vvar, code) { khint_t __i;		\
+	for (__i = kh_begin(h); __i != kh_end(h); ++__i) {		\
+		if (!kh_exist(h,__i)) continue;						\
+		(vvar) = kh_val(h,__i);								\
+		code;												\
+	} }
+
+/* More convenient interfaces */
+
+/*! @function
+  @abstract     Instantiate a hash set containing integer keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT(name)										\
+	KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing integer keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT(name, khval_t)								\
+	KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash set containing 64-bit integer keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT64(name)										\
+	KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing 64-bit integer keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT64(name, khval_t)								\
+	KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+
+typedef const char *kh_cstr_t;
+/*! @function
+  @abstract     Instantiate a hash set containing const char* keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_STR(name)										\
+	KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing const char* keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_STR(name, khval_t)								\
+	KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
+
+#endif /* __AC_KHASH_H */
diff --git a/web/server/h2o/libh2o/deps/klib/khmm.c b/web/server/h2o/libh2o/deps/klib/khmm.c
new file mode 100644
index 000000000..711ade5a2
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/khmm.c
@@ -0,0 +1,423 @@
+#include <math.h>
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+#include "khmm.h"
+
+// new/delete hmm_par_t
+
+hmm_par_t *hmm_new_par(int m, int n)
+{
+	hmm_par_t *hp;
+	int i;
+	assert(m > 0 && n > 0);
+	hp = (hmm_par_t*)calloc(1, sizeof(hmm_par_t));
+	hp->m = m; hp->n = n;
+	hp->a0 = (FLOAT*)calloc(n, sizeof(FLOAT));
+	hp->a = (FLOAT**)calloc2(n, n, sizeof(FLOAT));
+	hp->e = (FLOAT**)calloc2(m + 1, n, sizeof(FLOAT));
+	hp->ae = (FLOAT**)calloc2((m + 1) * n, n, sizeof(FLOAT));
+	for (i = 0; i != n; ++i) hp->e[m][i] = 1.0;
+	return hp;
+}
+void hmm_delete_par(hmm_par_t *hp)
+{
+	int i;
+	if (hp == 0) return;
+	for (i = 0; i != hp->n; ++i) free(hp->a[i]);
+	for (i = 0; i <= hp->m; ++i) free(hp->e[i]);
+	for (i = 0; i < (hp->m + 1) * hp->n; ++i) free(hp->ae[i]);
+	free(hp->a); free(hp->e); free(hp->a0); free(hp->ae);
+	free(hp);
+}
+
+// new/delete hmm_data_t
+
+hmm_data_t *hmm_new_data(int L, const char *seq, const hmm_par_t *hp)
+{
+	hmm_data_t *hd;
+	hd = (hmm_data_t*)calloc(1, sizeof(hmm_data_t));
+	hd->L = L;
+	hd->seq = (char*)malloc(L + 1);
+	memcpy(hd->seq + 1, seq, L);
+	return hd;
+}
+void hmm_delete_data(hmm_data_t *hd)
+{
+	int i;
+	if (hd == 0) return;
+	for (i = 0; i <= hd->L; ++i) {
+		if (hd->f) free(hd->f[i]);
+		if (hd->b) free(hd->b[i]);
+	}
+	free(hd->f); free(hd->b); free(hd->s); free(hd->v); free(hd->p); free(hd->seq);
+	free(hd);
+}
+
+// new/delete hmm_exp_t
+
+hmm_exp_t *hmm_new_exp(const hmm_par_t *hp)
+{
+	hmm_exp_t *he;
+	assert(hp);
+	he = (hmm_exp_t*)calloc(1, sizeof(hmm_exp_t));
+	he->m = hp->m; he->n = hp->n;
+	he->A0 = (FLOAT*)calloc(hp->n, sizeof(FLOAT));
+	he->A = (FLOAT**)calloc2(hp->n, hp->n, sizeof(FLOAT));
+	he->E = (FLOAT**)calloc2(hp->m + 1, hp->n, sizeof(FLOAT));
+	return he;
+}
+void hmm_delete_exp(hmm_exp_t *he)
+{
+	int i;
+	if (he == 0) return;
+	for (i = 0; i != he->n; ++i) free(he->A[i]);
+	for (i = 0; i <= he->m; ++i) free(he->E[i]);
+	free(he->A); free(he->E); free(he->A0);
+	free(he);
+}
+
+// Viterbi algorithm: stores the most probable state path in hd->v[1..L] and
+// returns its log-probability; sets HMM_VITERBI in hd->status.
+FLOAT hmm_Viterbi(const hmm_par_t *hp, hmm_data_t *hd)
+{
+	FLOAT **la, **le, *preV, *curV, max;
+	int **Vmax, max_l; // backtrace matrix
+	int k, l, b, u;
+
+	if (hd->v) free(hd->v);
+	hd->v = (int*)calloc(hd->L+1, sizeof(int));
+	la = (FLOAT**)calloc2(hp->n, hp->n, sizeof(FLOAT));
+	le = (FLOAT**)calloc2(hp->m + 1, hp->n, sizeof(FLOAT));
+	Vmax = (int**)calloc2(hd->L+1, hp->n, sizeof(int));
+	preV = (FLOAT*)malloc(sizeof(FLOAT) * hp->n);
+	curV = (FLOAT*)malloc(sizeof(FLOAT) * hp->n);
+	for (k = 0; k != hp->n; ++k)
+		for (l = 0; l != hp->n; ++l)
+			la[k][l] = log(hp->a[l][k]); // transposed on purpose (not a bug): la[k] is scanned over l below
+	for (b = 0; b != hp->m; ++b)
+		for (k = 0; k != hp->n; ++k)
+			le[b][k] = log(hp->e[b][k]);
+	for (k = 0; k != hp->n; ++k) le[hp->m][k] = 0.0;
+	// V_k(1)
+	for (k = 0; k != hp->n; ++k) {
+		preV[k] = le[(int)hd->seq[1]][k] + log(hp->a0[k]);
+		Vmax[1][k] = 0;
+	}
+	// all the rest
+	for (u = 2; u <= hd->L; ++u) {
+		FLOAT *tmp, *leu = le[(int)hd->seq[u]];
+		for (k = 0; k != hp->n; ++k) {
+			FLOAT *laa = la[k];
+			for (l = 0, max = -HMM_INF, max_l = -1; l != hp->n; ++l) {
+				if (max < preV[l] + laa[l]) {
+					max = preV[l] + laa[l];
+					max_l = l;
+				}
+			}
+			assert(max_l >= 0); // some predecessor must score above -HMM_INF
+			curV[k] = leu[k] + max;
+			Vmax[u][k] = max_l;
+		}
+		tmp = curV; curV = preV; preV = tmp; // swap
+	}
+	// backtrace
+	for (k = 0, max_l = -1, max = -HMM_INF; k != hp->n; ++k) {
+		if (max < preV[k]) {
+			max = preV[k]; max_l = k;
+		}
+	}
+	assert(max_l >= 0); // some final state must score above -HMM_INF
+	hd->v[hd->L] = max_l;
+	for (u = hd->L; u >= 1; --u)
+		hd->v[u-1] = Vmax[u][hd->v[u]];
+	for (k = 0; k != hp->n; ++k) free(la[k]);
+	for (b = 0; b <= hp->m; ++b) free(le[b]); // le has m + 1 rows
+	for (u = 0; u <= hd->L; ++u) free(Vmax[u]);
+	free(la); free(le); free(Vmax); free(preV); free(curV);
+	hd->status |= HMM_VITERBI;
+	return max;
+}
+
+// forward algorithm: fills hd->f[1..L] (each row rescaled to sum to 1) and the
+// per-position scaling factors hd->s[1..L], then sets HMM_FORWARD in hd->status.
+void hmm_forward(const hmm_par_t *hp, hmm_data_t *hd)
+{
+	FLOAT sum, tmp, **at;
+	int u, k, l;
+	int n, m, L;
+	assert(hp && hd);
+	// allocate memory for hd->f and hd->s
+	n = hp->n; m = hp->m; L = hd->L;
+	if (hd->s) free(hd->s);
+	if (hd->f) { // drop the rows left by a previous run
+		for (k = 0; k <= hd->L; ++k) free(hd->f[k]);
+		free(hd->f);
+	}
+	hd->f = (FLOAT**)calloc2(hd->L+1, hp->n, sizeof(FLOAT));
+	hd->s = (FLOAT*)calloc(hd->L+1, sizeof(FLOAT));
+	hd->status &= ~(unsigned)HMM_FORWARD; // flag is set again only once f and s are complete
+	// at[][] array helps to improve the cache efficiency
+	at = (FLOAT**)calloc2(n, n, sizeof(FLOAT));
+	// transpose a[][]
+	for (k = 0; k != n; ++k)
+		for (l = 0; l != n; ++l)
+			at[k][l] = hp->a[l][k];
+	// f[0], but it should never be used
+	hd->s[0] = 1.0;
+	for (k = 0; k != n; ++k) hd->f[0][k] = 0.0;
+	// f[1]: start probability times emission of the first symbol, then normalized by its sum
+	for (k = 0, sum = 0.0; k != n; ++k)
+		sum += (hd->f[1][k] = hp->a0[k] * hp->e[(int)hd->seq[1]][k]);
+	for (k = 0; k != n; ++k) hd->f[1][k] /= sum;
+	hd->s[1] = sum;
+	// f[2..L], the core loop
+	for (u = 2; u <= L; ++u) {
+		FLOAT *fu = hd->f[u], *fu1 = hd->f[u-1], *eu = hp->e[(int)hd->seq[u]];
+		for (k = 0, sum = 0.0; k != n; ++k) {
+			FLOAT *aa = at[k];
+			for (l = 0, tmp = 0.0; l != n; ++l) tmp += fu1[l] * aa[l];
+			sum += (fu[k] = eu[k] * tmp);
+		}
+		for (k = 0; k != n; ++k) fu[k] /= sum; // rescale the row; the divisor is kept in s[u]
+		hd->s[u] = sum;
+	}
+	// free at array
+	for (k = 0; k != hp->n; ++k) free(at[k]);
+	free(at);
+	hd->status |= HMM_FORWARD;
+}
+
+//  precalculate hp->ae
+
+void hmm_pre_backward(hmm_par_t *hp)
+{
+	int m, n, b, k, l;
+	assert(hp);
+	m = hp->m; n = hp->n;
+	for (b = 0; b <= m; ++b) {
+		for (k = 0; k != n; ++k) {
+			FLOAT *p = hp->ae[b * hp->n + k];
+			for (l = 0; l != n; ++l)
+				p[l] = hp->e[b][l] * hp->a[k][l];
+		}
+	}
+}
+
+// backward algorithm: fills hd->b[1..L] using hp->ae and the scaling factors
+// hd->s[] from hmm_forward(), then sets HMM_BACKWARD in hd->status.
+void hmm_backward(const hmm_par_t *hp, hmm_data_t *hd)
+{
+	FLOAT tmp;
+	int k, l, u;
+	int m, n, L;
+	assert(hp && hd);
+	assert(hd->status & HMM_FORWARD);
+	// allocate memory for hd->b
+	m = hp->m; n = hp->n; L = hd->L;
+	if (hd->b) { // drop the rows left by a previous run
+		for (k = 0; k <= hd->L; ++k) free(hd->b[k]);
+		free(hd->b);
+	}
+	hd->status &= ~(unsigned)HMM_BACKWARD;
+	hd->b = (FLOAT**)calloc2(L+1, hp->n, sizeof(FLOAT));
+	// b[L]
+	for (k = 0; k != hp->n; ++k) hd->b[L][k] = 1.0 / hd->s[L];
+	// b[1..L-1], the core loop
+	for (u = L-1; u >= 1; --u) {
+		FLOAT *bu1 = hd->b[u+1], **p = hp->ae + (int)hd->seq[u+1] * n; // p: the n rows of ae for symbol seq[u+1]
+		for (k = 0; k != n; ++k) {
+			FLOAT *q = p[k];
+			for (l = 0, tmp = 0.0; l != n; ++l) tmp += q[l] * bu1[l];
+			hd->b[u][k] = tmp / hd->s[u];
+		}
+	}
+	hd->status |= HMM_BACKWARD;
+	for (l = 0, tmp = 0.0; l != n; ++l) // sanity check on the result; only a warning is printed
+		tmp += hp->a0[l] * hd->b[1][l] * hp->e[(int)hd->seq[1]][l];
+	if (tmp > 1.0 + 1e-6 || tmp < 1.0 - 1e-6) // in theory, tmp should always be equal to 1
+		fprintf(stderr, "++ Underflow may have happened (%lg).\n", tmp);
+}
+
+// log-likelihood of the observation: adds up log(hd->s[u]) for u = 1..L, using a
+// running product that is folded into the log sum before it leaves the safe range
+FLOAT hmm_lk(const hmm_data_t *hd)
+{
+	FLOAT loglk = 0.0, chunk = 1.0;
+	const int len = hd->L;
+	int pos;
+	assert(hd->status & HMM_FORWARD);
+	for (pos = 1; pos <= len; ++pos) {
+		chunk *= hd->s[pos];
+		if (chunk >= 1.0/HMM_TINY || chunk < HMM_TINY) { // fold and restart
+			loglk += log(chunk);
+			chunk = 1.0;
+		}
+	}
+	loglk += log(chunk); // whatever is left in the running product
+	return loglk;
+}
+
+// posterior decoding: for u = 1..L, hd->p[u] is the state k with the largest
+// f[u][k] * b[u][k] * s[u]; sets HMM_POSTDEC in hd->status.
+void hmm_post_decode(const hmm_par_t *hp, hmm_data_t *hd)
+{
+	int u, k;
+	assert(hd->status & HMM_BACKWARD); // bit test: hd->b must have been filled by hmm_backward()
+	if (hd->p) free(hd->p);
+	hd->p = (int*)calloc(hd->L + 1, sizeof(int));
+	for (u = 1; u <= hd->L; ++u) {
+		FLOAT prob, max, *fu = hd->f[u], *bu = hd->b[u], su = hd->s[u];
+		int max_k;
+		for (k = 0, max = -1.0, max_k = -1; k != hp->n; ++k) {
+			if (max < (prob = fu[k] * bu[k] * su)) {
+				max = prob; max_k = k;
+			}
+		}
+		assert(max_k >= 0);
+		hd->p[u] = max_k;
+	}
+	hd->status |= HMM_POSTDEC;
+}
+
+// posterior probability of states at position u: writes f[u][k] * b[u][k] * s[u]
+// into prob[k] for every state k and returns the total of those values
+FLOAT hmm_post_state(const hmm_par_t *hp, const hmm_data_t *hd, int u, FLOAT *prob)
+{
+	const FLOAT scale = hd->s[u], *fwd = hd->f[u], *bwd = hd->b[u];
+	FLOAT total = 0.0;
+	int state = 0;
+	for (; state != hp->n; ++state) total += (prob[state] = fwd[state] * bwd[state] * scale);
+	return total; // in theory, this should always be equal to 1.0
+}
+
+// expected counts
+
+hmm_exp_t *hmm_expect(const hmm_par_t *hp, const hmm_data_t *hd)
+{
+	int k, l, u, b, m, n;
+	hmm_exp_t *he;
+	assert(hd->status & HMM_BACKWARD);
+	he = hmm_new_exp(hp);
+	// initialization
+	m = hp->m; n = hp->n;
+	for (k = 0; k != n; ++k)
+		for (l = 0; l != n; ++l) he->A[k][l] = HMM_TINY;
+	for (b = 0; b <= m; ++b)
+		for (l = 0; l != n; ++l) he->E[b][l] = HMM_TINY;
+	// calculate A_{kl} and E_k(b), k,l\in[0,n)
+	for (u = 1; u < hd->L; ++u) {
+		FLOAT *fu = hd->f[u], *bu = hd->b[u], *bu1 = hd->b[u+1], ss = hd->s[u];
+		FLOAT *Ec = he->E[(int)hd->seq[u]], **p = hp->ae + (int)hd->seq[u+1] * n;
+		for (k = 0; k != n; ++k) {
+			FLOAT *q = p[k], *AA = he->A[k], fuk = fu[k];
+			for (l = 0; l != n; ++l) // this is cache-efficient
+				AA[l] += fuk * q[l] * bu1[l];
+			Ec[k] += fuk * bu[k] * ss;
+		}
+	}
+	// calculate A0_l
+	for (l = 0; l != n; ++l)
+		he->A0[l] += hp->a0[l] * hp->e[(int)hd->seq[1]][l] * hd->b[1][l];
+	return he;
+}
+
+FLOAT hmm_Q0(const hmm_par_t *hp, hmm_exp_t *he)
+{
+	int k, l, b;
+	FLOAT sum = 0.0;
+	for (k = 0; k != hp->n; ++k) {
+		FLOAT tmp;
+		for (b = 0, tmp = 0.0; b != hp->m; ++b) tmp += he->E[b][k];
+		for (b = 0; b != hp->m; ++b)
+			sum += he->E[b][k] * log(he->E[b][k] / tmp);
+	}
+	for (k = 0; k != hp->n; ++k) {
+		FLOAT tmp, *A = he->A[k];
+		for (l = 0, tmp = 0.0; l != hp->n; ++l) tmp += A[l];
+		for (l = 0; l != hp->n; ++l) sum += A[l] * log(A[l] / tmp);
+	}
+	return (he->Q0 = sum);
+}
+
+// add he0 to he1: accumulates A0, A and rows 0..m-1 of E element by element
+// (row m of E is not touched)
+void hmm_add_expect(const hmm_exp_t *he0, hmm_exp_t *he1)
+{
+	int row, col;
+	const int n_states = he1->n, n_symbols = he1->m;
+	assert(he0->m == he1->m && he0->n == he1->n);
+	for (row = 0; row != n_states; ++row) {
+		FLOAT *dst = he1->A[row];
+		const FLOAT *src = he0->A[row];
+		he1->A0[row] += he0->A0[row];
+		for (col = 0; col != n_states; ++col) dst[col] += src[col];
+	}
+	for (row = 0; row != n_symbols; ++row)
+		for (col = 0; col != n_states; ++col) he1->E[row][col] += he0->E[row][col];
+}
+
+// the EM-Q function
+
+FLOAT hmm_Q(const hmm_par_t *hp, const hmm_exp_t *he)
+{
+	FLOAT sum = 0.0;
+	int bb, k, l;
+	for (bb = 0; bb != he->m; ++bb) {
+		FLOAT *eb = hp->e[bb], *Eb = he->E[bb];
+		for (k = 0; k != hp->n; ++k) {
+			if (eb[k] <= 0.0) return -HMM_INF;
+			sum += Eb[k] * log(eb[k]);
+		}
+	}
+	for (k = 0; k != he->n; ++k) {
+		FLOAT *Ak = he->A[k], *ak = hp->a[k];
+		for (l = 0; l != he->n; ++l) {
+			if (ak[l] <= 0.0) return -HMM_INF;
+			sum += Ak[l] * log(ak[l]);
+		}
+	}
+	return (sum -= he->Q0);
+}
+
+// simulate sequence: draws L symbols from the model and returns them in a
+// calloc'ed buffer of L + 1 bytes (symbols are stored from index 0).
+char *hmm_simulate(const hmm_par_t *hp, int L)
+{
+	int i, k, l, b;
+	FLOAT x, y, **et;
+	char *seq;
+	seq = (char*)calloc(L+1, 1);
+	// calculate the transpose of hp->e[][]
+	et = (FLOAT**)calloc2(hp->n, hp->m, sizeof(FLOAT));
+	for (k = 0; k != hp->n; ++k)
+		for (b = 0; b != hp->m; ++b)
+			et[k][b] = hp->e[b][k];
+	// the initial state, drawn from a0[]; the scan stops at n - 1 so k is always a valid state
+	x = drand48();
+	for (k = 0, y = 0.0; k != hp->n - 1; ++k) {
+		y += hp->a0[k];
+		if (y >= x) break;
+	}
+	// main loop
+	for (i = 0; i != L; ++i) {
+		FLOAT *el, *ak = hp->a[k];
+		x = drand48();
+		for (l = 0, y = 0.0; l != hp->n - 1; ++l) { // next state; same n - 1 cap keeps et[l] in range
+			y += ak[l];
+			if (y >= x) break;
+		}
+		el = et[l];
+		x = drand48();
+		for (b = 0, y = 0.0; b != hp->m; ++b) {
+			y += el[b];
+			if (y >= x) break;
+		}
+		seq[i] = b;
+		k = l;
+	}
+	for (k = 0; k != hp->n; ++k) free(et[k]);
+	free(et);
+	return seq;
+}
diff --git a/web/server/h2o/libh2o/deps/klib/khmm.h b/web/server/h2o/libh2o/deps/klib/khmm.h
new file mode 100644
index 000000000..d87673b93
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/khmm.h
@@ -0,0 +1,107 @@
+#ifndef AC_SCHMM_H_
+#define AC_SCHMM_H_
+
+/*
+ * Last Modified: 2008-03-10
+ * Version: 0.1.0-8
+ *
+ * 2008-03-10, 0.1.0-8: make icc report two more "VECTORIZED"
+ * 2008-03-10, 0.1.0-7: accelerate for some CPU
+ * 2008-02-07, 0.1.0-6: simulate sequences
+ * 2008-01-15, 0.1.0-5: goodness of fit
+ * 2007-11-20, 0.1.0-4: add function declaration of hmm_post_decode()
+ * 2007-11-09: fix a memory leak
+ */
+
+#include <stdlib.h>
+
+#define HMM_VERSION "0.1.0-7" /* NOTE(review): the changelog at the top of this header lists 0.1.0-8 as the latest version */
+
+#define HMM_FORWARD  0x02 /* bit flags; presumably combined in hmm_data_t.status -- TODO confirm in khmm.c */
+#define HMM_BACKWARD 0x04
+#define HMM_VITERBI  0x40
+#define HMM_POSTDEC  0x80
+
+#ifndef FLOAT
+#define FLOAT double /* default numeric type; define FLOAT before including this header to override it */
+#endif
+#define HMM_TINY     1e-25
+#define HMM_INF      1e300 /* stands in for infinity: hmm_Q() returns -HMM_INF for a non-positive probability */
+
+typedef struct
+{
+	int m, n; // number of symbols, number of states
+	FLOAT **a, **e; // transition matrix a[k][l] (state k to l) and emission probabilities e[b][k] (symbol b, state k)
+	FLOAT **ae; // auxiliary array for acceleration, should be calculated by hmm_pre_backward()
+	FLOAT *a0; // transition probabilities from the start state, one per state
+} hmm_par_t;
+
+typedef struct
+{
+	int L; // presumably the sequence length, cf. hmm_new_data(int L, ...) -- TODO confirm in khmm.c
+	unsigned status; // presumably a bit set of HMM_FORWARD/HMM_BACKWARD/HMM_VITERBI/HMM_POSTDEC -- TODO confirm
+	char *seq; // presumably the observed symbol sequence -- TODO confirm in khmm.c
+	FLOAT **f, **b, *s; // presumably forward/backward values and scaling factors -- TODO confirm in khmm.c
+	int *v; // Viterbi path
+	int *p; // posterior decoding
+} hmm_data_t;
+
+typedef struct
+{
+	int m, n; // same meaning as in hmm_par_t; hmm_add_expect() requires both operands to agree on them
+	FLOAT Q0, **A, **E, *A0; // A[n][n], E[m][n] and A0[n] are summed by hmm_add_expect(); Q0 is subtracted in hmm_Q()
+} hmm_exp_t;
+
+typedef struct
+{ // NOTE(review): not used by any prototype in this header; presumably goodness-of-fit data (changelog 0.1.0-5) -- confirm
+	int l, *obs;
+	FLOAT *thr;
+} hmm_gof_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+	/* initialize and destroy hmm_par_t */
+	hmm_par_t *hmm_new_par(int m, int n);
+	void hmm_delete_par(hmm_par_t *hp);
+	/* initialize and destroy hmm_data_t */
+	hmm_data_t *hmm_new_data(int L, const char *seq, const hmm_par_t *hp);
+	void hmm_delete_data(hmm_data_t *hd);
+	/* initialize and destroy hmm_exp_t */
+	hmm_exp_t *hmm_new_exp(const hmm_par_t *hp);
+	void hmm_delete_exp(hmm_exp_t *he);
+	/* Viterbi, forward and backward algorithms */
+	FLOAT hmm_Viterbi(const hmm_par_t *hp, hmm_data_t *hd);
+	void hmm_pre_backward(hmm_par_t *hp);
+	void hmm_forward(const hmm_par_t *hp, hmm_data_t *hd);
+	void hmm_backward(const hmm_par_t *hp, hmm_data_t *hd);
+	/* log-likelihood of the observations (natural logarithm) */
+	FLOAT hmm_lk(const hmm_data_t *hd);
+	/* posterior probability at the position on the sequence */
+	FLOAT hmm_post_state(const hmm_par_t *hp, const hmm_data_t *hd, int u, FLOAT *prob);
+	/* posterior decoding */
+	void hmm_post_decode(const hmm_par_t *hp, hmm_data_t *hd);
+	/* expected counts of transitions and emissions */
+	hmm_exp_t *hmm_expect(const hmm_par_t *hp, const hmm_data_t *hd);
+	/* add he0 counts to he1 counts */
+	void hmm_add_expect(const hmm_exp_t *he0, hmm_exp_t *he1);
+	/* the Q function that should be maximized in EM */
+	FLOAT hmm_Q(const hmm_par_t *hp, const hmm_exp_t *he);
+	FLOAT hmm_Q0(const hmm_par_t *hp, hmm_exp_t *he);
+	/* simulate sequences */
+	char *hmm_simulate(const hmm_par_t *hp, int L);
+#ifdef __cplusplus
+}
+#endif
+
+static inline void **calloc2(int n_row, int n_col, int size)
+{ /* n_row zero-filled rows of n_col*size bytes each; returns NULL, leaking nothing, if an allocation fails */
+	char **p = (char**)malloc(sizeof(char*) * n_row);
+	int k, ok = (p != 0);
+	for (k = 0; ok && k != n_row; ++k) /* a NULL row counts as failure only for a non-empty request */
+		ok = ((p[k] = (char*)calloc(n_col, size)) != 0 || n_col == 0 || size == 0);
+	if (p != 0 && !ok) { while (k != 0) free(p[--k]); free(p); p = 0; } /* undo the partial allocation */
+	return (void**)p;
+}
+
+#endif
diff --git a/web/server/h2o/libh2o/deps/klib/klist.h b/web/server/h2o/libh2o/deps/klib/klist.h
new file mode 100644
index 000000000..8b33f271e
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/klist.h
@@ -0,0 +1,121 @@
+/* The MIT License
+
+   Copyright (c) 2008-2009, by Attractive Chaos <attractor@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+#ifndef _AC_KLIST_H
+#define _AC_KLIST_H
+
+#include <stdlib.h>
+
+#define KMEMPOOL_INIT(name, kmptype_t, kmpfree_f) /* defines pool type kmp_<name>_t and kmp_{init,destroy,alloc,free}_<name>() */ \
+	typedef struct { \
+		size_t cnt, n, max; /* objects handed out, objects parked in buf[], capacity of buf[] */ \
+		kmptype_t **buf; \
+	} kmp_##name##_t; \
+	static inline kmp_##name##_t *kmp_init_##name(void) { \
+		return (kmp_##name##_t *)calloc(1, sizeof(kmp_##name##_t)); \
+	} \
+	static inline void kmp_destroy_##name(kmp_##name##_t *mp) { \
+		size_t k; \
+		for (k = 0; k < mp->n; ++k) { kmpfree_f(mp->buf[k]); free(mp->buf[k]); } \
+		free(mp->buf); free(mp); \
+	} \
+	static inline kmptype_t *kmp_alloc_##name(kmp_##name##_t *mp) { \
+		++mp->cnt; \
+		if (mp->n == 0) return (kmptype_t *)calloc(1, sizeof(kmptype_t)); \
+		return mp->buf[--mp->n]; \
+	} \
+	static inline void kmp_free_##name(kmp_##name##_t *mp, kmptype_t *p) { \
+		--mp->cnt; \
+		if (mp->n == mp->max) { \
+			size_t cap = mp->max? mp->max<<1 : 16; \
+			kmptype_t **grown = (kmptype_t **)realloc(mp->buf, sizeof(kmptype_t *) * cap); \
+			if (grown == 0) { kmpfree_f(p); free(p); return; } /* cannot grow: release p instead of pooling it */ \
+			mp->buf = grown; mp->max = cap; \
+		} \
+		mp->buf[mp->n++] = p; \
+	}
+
+#define kmempool_t(name) kmp_##name##_t /* pool type generated by KMEMPOOL_INIT(name, ...) */
+#define kmp_init(name) kmp_init_##name() /* new zero-initialised pool */
+#define kmp_destroy(name, mp) kmp_destroy_##name(mp) /* frees the objects parked in the pool, then the pool itself */
+#define kmp_alloc(name, mp) kmp_alloc_##name(mp) /* takes a parked object, or calloc()s one if none is parked */
+#define kmp_free(name, mp, p) kmp_free_##name(mp, p) /* parks p in the pool for later reuse */
+
+#define KLIST_INIT(name, kltype_t, kmpfree_t) /* defines list type kl_<name>_t (pool-backed, empty sentinel node at the tail) and its functions */ \
+	struct __kl1_##name { \
+		kltype_t data; \
+		struct __kl1_##name *next; \
+	}; \
+	typedef struct __kl1_##name kl1_##name; \
+	KMEMPOOL_INIT(name, kl1_##name, kmpfree_t) \
+	typedef struct { \
+		kl1_##name *head, *tail; \
+		kmp_##name##_t *mp; \
+		size_t size; \
+	} kl_##name##_t; \
+	static inline kl_##name##_t *kl_init_##name(void) { /* returns NULL when out of memory */ \
+		kl_##name##_t *kl = (kl_##name##_t *)calloc(1, sizeof(kl_##name##_t)); \
+		if (kl == 0) return 0; \
+		if ((kl->mp = kmp_init(name)) != 0) kl->head = kl->tail = kmp_alloc(name, kl->mp); \
+		if (kl->head == 0) { free(kl->mp); free(kl); return 0; } /* the pool holds nothing yet, so free() suffices */ \
+		kl->head->next = 0; \
+		return kl; \
+	} \
+	static inline void kl_destroy_##name(kl_##name##_t *kl) { \
+		kl1_##name *p, *nx; \
+		for (p = kl->head; p != kl->tail; p = nx) { nx = p->next; kmp_free(name, kl->mp, p); } /* read next before giving p back */ \
+		kmp_free(name, kl->mp, p); \
+		kmp_destroy(name, kl->mp); \
+		free(kl); \
+	} \
+	static inline kltype_t *kl_pushp_##name(kl_##name##_t *kl) { /* appends a slot and returns it; NULL when out of memory */ \
+		kl1_##name *q, *p = kmp_alloc(name, kl->mp); \
+		if (p == 0) return 0; \
+		q = kl->tail; p->next = 0; kl->tail->next = p; kl->tail = p; ++kl->size; \
+		return &q->data; \
+	} \
+	static inline int kl_shift_##name(kl_##name##_t *kl, kltype_t *d) { /* pops the head into *d (if d != 0); -1 when empty */ \
+		kl1_##name *p; \
+		if (kl->head->next == 0) return -1; \
+		--kl->size; \
+		p = kl->head; kl->head = kl->head->next; \
+		if (d) *d = p->data; \
+		kmp_free(name, kl->mp, p); \
+		return 0; \
+	}
+
+#define kliter_t(name) kl1_##name /* node type of the list; also serves as the iterator type */
+#define klist_t(name) kl_##name##_t /* list type generated by KLIST_INIT(name, ...) */
+#define kl_val(iter) ((iter)->data) /* payload stored in a node */
+#define kl_next(iter) ((iter)->next) /* following node */
+#define kl_begin(kl) ((kl)->head) /* first node */
+#define kl_end(kl) ((kl)->tail) /* the tail node, which kl_pushp() keeps as an unused placeholder */
+
+#define kl_init(name) kl_init_##name() /* new empty list */
+#define kl_destroy(name, kl) kl_destroy_##name(kl) /* releases all nodes, the pool and the list */
+#define kl_pushp(name, kl) kl_pushp_##name(kl) /* appends an element and returns a pointer to its data */
+#define kl_shift(name, kl, d) kl_shift_##name(kl, d) /* removes the first element; returns -1 if the list is empty */
+
+#endif
diff --git a/web/server/h2o/libh2o/deps/klib/kmath.c b/web/server/h2o/libh2o/deps/klib/kmath.c
new file mode 100644
index 000000000..9807b00ee
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/kmath.c
@@ -0,0 +1,456 @@
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "kmath.h"
+
+/**************************************
+ *** Pseudo-random number generator ***
+ **************************************/
+
+/* 
+   64-bit Mersenne Twister pseudorandom number generator. Adapted from:
+
+     http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/VERSIONS/C-LANG/mt19937-64.c
+
+   which was written by Takuji Nishimura and Makoto Matsumoto and released
+   under the 3-clause BSD license.
+*/
+
+#define KR_NN 312
+#define KR_MM 156
+#define KR_UM 0xFFFFFFFF80000000ULL /* Most significant 33 bits */
+#define KR_LM 0x7FFFFFFFULL /* Least significant 31 bits */
+
+struct _krand_t {
+	int mti; /* index of the next unused word in mt[]; kr_rand() regenerates mt[] once this reaches KR_NN */
+	krint64_t mt[KR_NN]; /* generator state words */
+};
+
+static void kr_srand0(krint64_t seed, krand_t *kr)
+{ /* fill the state array from seed; mti is left at KR_NN, so the next kr_rand() regenerates the block first */
+	int i;
+	for (kr->mt[0] = seed, i = 1; i < KR_NN; ++i) kr->mt[i] = 6364136223846793005ULL * (kr->mt[i - 1] ^ (kr->mt[i - 1] >> 62)) + i;
+	kr->mti = i;
+}
+
+krand_t *kr_srand(krint64_t seed)
+{ /* returns a heap-allocated generator seeded with `seed` (release it with free()), or NULL if out of memory */
+	krand_t *kr = (krand_t*)malloc(sizeof(krand_t));
+	if (kr == 0) return 0; /* do not hand a NULL state to kr_srand0() */
+	kr_srand0(seed, kr);
+	return kr;
+}
+
+krint64_t kr_rand(krand_t *kr)
+{ /* returns the next 64-bit output; the KR_NN-word state block is regenerated whenever it has been used up */
+	krint64_t x;
+	static const krint64_t mag01[2] = { 0, 0xB5026F5AA96619E9ULL }; /* XOR mask selected by the low bit of x */
+    if (kr->mti >= KR_NN) {
+		int i;
+		if (kr->mti == KR_NN + 1) kr_srand0(5489ULL, kr); /* mti == KR_NN+1 is treated as "not seeded": use seed 5489 */
+        for (i = 0; i < KR_NN - KR_MM; ++i) {
+            x = (kr->mt[i] & KR_UM) | (kr->mt[i+1] & KR_LM); /* upper bits of word i, lower bits of word i+1 */
+            kr->mt[i] = kr->mt[i + KR_MM] ^ (x>>1) ^ mag01[(int)(x&1)];
+        }
+        for (; i < KR_NN - 1; ++i) { /* same update; the partner index i + KR_MM wraps around the array */
+            x = (kr->mt[i] & KR_UM) | (kr->mt[i+1] & KR_LM);
+            kr->mt[i] = kr->mt[i + (KR_MM - KR_NN)] ^ (x>>1) ^ mag01[(int)(x&1)];
+        }
+        x = (kr->mt[KR_NN - 1] & KR_UM) | (kr->mt[0] & KR_LM); /* last word pairs with word 0 */
+        kr->mt[KR_NN - 1] = kr->mt[KR_MM - 1] ^ (x>>1) ^ mag01[(int)(x&1)];
+        kr->mti = 0;
+    }
+    x = kr->mt[kr->mti++]; /* take the next state word, then scramble it with the shifts and masks below */
+    x ^= (x >> 29) & 0x5555555555555555ULL;
+    x ^= (x << 17) & 0x71D67FFFEDA60000ULL;
+    x ^= (x << 37) & 0xFFF7EEE000000000ULL;
+    x ^= (x >> 43);
+    return x;
+}
+
+#ifdef _KR_MAIN
+int main(int argc, char *argv[])
+{ /* benchmark driver (built only with _KR_MAIN): draws N numbers and discards them */
+	long i, N = 200000000; /* default number of draws */
+	krand_t *kr;
+	if (argc > 1) N = atol(argv[1]); /* optional first argument overrides N */
+	kr = kr_srand(11);
+	for (i = 0; i < N; ++i) kr_rand(kr);
+//	for (i = 0; i < N; ++i) lrand48();
+	free(kr);
+	return 0;
+}
+#endif
+
+/******************************
+ *** Non-linear programming ***
+ ******************************/
+
+/* Hooke-Jeeves algorithm for nonlinear minimization
+ 
+   Based on the pseudocodes by Bell and Pike (CACM 9(9):684-685), and
+   the revision by Tomlin and Smith (CACM 12(11):637-638). Both of the
+   papers are comments on Kaupe's Algorithm 178 "Direct Search" (ACM
+   6(6):313-314). The original algorithm was designed by Hooke and
+   Jeeves (ACM 8:212-229). This program is further revised according to
+   Johnson's implementation at Netlib (opt/hooke.c).
+ 
+   Hooke-Jeeves algorithm is very simple and it works quite well on a
+   few examples. However, it might fail to converge due to its heuristic
+   nature. A possible improvement, as is suggested by Johnson, may be to
+   choose a small r at the beginning to quickly approach to the minimum
+   and a large r at later step to hit the minimum.
+ */
+
+static double __kmin_hj_aux(kmin_f func, int n, double *x1, void *data, double fx1, double *dx, int *n_calls)
+{ /* one sweep over the n coordinates: try x1[k] + dx[k], then the opposite step, keeping whichever lowers func below fx1 */
+	int k, j = *n_calls; /* j: running count of func evaluations, written back to *n_calls at the end */
+	double ftmp;
+	for (k = 0; k != n; ++k) {
+		x1[k] += dx[k];
+		ftmp = func(n, x1, data); ++j;
+		if (ftmp < fx1) fx1 = ftmp;
+		else { /* search the opposite direction */
+			dx[k] = 0.0 - dx[k]; /* the flipped sign is kept in dx[] for the caller */
+			x1[k] += dx[k] + dx[k];
+			ftmp = func(n, x1, data); ++j;
+			if (ftmp < fx1) fx1 = ftmp;
+			else x1[k] -= dx[k]; /* back to the original x[k] */
+		}
+	}
+	*n_calls = j;
+	return fx1; /* here: fx1=f(n,x1) */
+}
+
+double kmin_hj(kmin_f func, int n, double *x, void *data, double r, double eps, int max_calls)
+{ /* Hooke-Jeeves search for a minimum of func over n variables; x[] is the start point and is updated in place */
+	double fx, fx1, *x1, *dx, radius; /* fx: func at x[]; x1/fx1: trial point and its value; dx: signed step per coordinate */
+	int k, n_calls = 0; /* n_calls counts evaluations of func and is compared against max_calls */
+	x1 = (double*)calloc(n, sizeof(double));
+	dx = (double*)calloc(n, sizeof(double));
+	for (k = 0; k != n; ++k) { /* initial directions, based on MGJ */
+		dx[k] = fabs(x[k]) * r;
+		if (dx[k] == 0) dx[k] = r;
+	}
+	radius = r; /* shrinks by the factor r each round; the search stops once it drops below eps */
+	fx1 = fx = func(n, x, data); ++n_calls;
+	for (;;) {
+		memcpy(x1, x, n * sizeof(double)); /* x1 = x */
+		fx1 = __kmin_hj_aux(func, n, x1, data, fx, dx, &n_calls);
+		while (fx1 < fx) {
+			for (k = 0; k != n; ++k) {
+				double t = x[k];
+				dx[k] = x1[k] > x[k]? fabs(dx[k]) : 0.0 - fabs(dx[k]); /* point dx along the move just made */
+				x[k] = x1[k]; /* accept the improved point */
+				x1[k] = x1[k] + x1[k] - t; /* pattern move: step once more by the same displacement */
+			}
+			fx = fx1;
+			if (n_calls >= max_calls) break;
+			fx1 = func(n, x1, data); ++n_calls;
+			fx1 = __kmin_hj_aux(func, n, x1, data, fx1, dx, &n_calls);
+			if (fx1 >= fx) break;
+			for (k = 0; k != n; ++k)
+				if (fabs(x1[k] - x[k]) > .5 * fabs(dx[k])) break;
+			if (k == n) break; /* x1 stayed within half a step of x in every coordinate */
+		}
+		if (radius >= eps) {
+			if (n_calls >= max_calls) break;
+			radius *= r;
+			for (k = 0; k != n; ++k) dx[k] *= r;
+		} else break; /* converge */
+	}
+	free(x1); free(dx);
+	return fx1; /* NOTE(review): fx1 is the last trial value; after a rejected pattern move it can differ from func at x[] -- confirm intent */
+}
+
+// I copied this function from somewhere several years ago and modified it, but I forgot the source.
+double kmin_brent(kmin1_f func, double a, double b, void *data, double tol, double *xmin)
+{ // 1-D minimization: brackets a minimum starting from (a,b), refines it; returns the lowest value found, its location in *xmin
+	double bound, u, r, q, fu, tmp, fa, fb, fc, c;
+	const double gold1 = 1.6180339887;
+	const double gold2 = 0.3819660113;
+	const double tiny = 1e-20;
+	const int max_iter = 100; // cap on the refinement iterations of the second phase
+
+	double e, d, w, v, mid, tol1, tol2, p, eold, fv, fw;
+	int iter;
+
+	fa = func(a, data); fb = func(b, data);
+	if (fb > fa) { // swap, such that f(a) > f(b)
+		tmp = a; a = b; b = tmp;
+		tmp = fa; fa = fb; fb = tmp;
+	}
+	c = b + gold1 * (b - a), fc = func(c, data); // golden section extrapolation
+	while (fb > fc) {
+		bound = b + 100.0 * (c - b); // the farthest point where we want to go
+		r = (b - a) * (fb - fc);
+		q = (b - c) * (fb - fa);
+		if (fabs(q - r) < tiny) { // avoid 0 denominator
+			tmp = q > r? tiny : 0.0 - tiny;
+		} else tmp = q - r;
+		u = b - ((b - c) * q - (b - a) * r) / (2.0 * tmp); // u is the parabolic extrapolation point
+		if ((b > u && u > c) || (b < u && u < c)) { // u lies between b and c
+			fu = func(u, data);
+			if (fu < fc) { // (b,u,c) bracket the minimum
+				a = b; b = u; fa = fb; fb = fu;
+				break;
+			} else if (fu > fb) { // (a,b,u) bracket the minimum
+				c = u; fc = fu;
+				break;
+			}
+			u = c + gold1 * (c - b); fu = func(u, data); // golden section extrapolation
+		} else if ((c > u && u > bound) || (c < u && u < bound)) { // u lies between c and bound
+			fu = func(u, data);
+			if (fu < fc) { // fb > fc > fu
+				b = c; c = u; u = c + gold1 * (c - b);
+				fb = fc; fc = fu; fu = func(u, data);
+			} else { // (b,c,u) bracket the minimum
+				a = b; b = c; c = u;
+				fa = fb; fb = fc; fc = fu;
+				break;
+			}
+		} else if ((u > bound && bound > c) || (u < bound && bound < c)) { // u goes beyond the bound
+			u = bound; fu = func(u, data);
+		} else { // u goes the other way around, use golden section extrapolation
+			u = c + gold1 * (c - b); fu = func(u, data);
+		}
+		a = b; b = c; c = u;
+		fa = fb; fb = fc; fc = fu;
+	}
+	if (a > c) u = a, a = c, c = u; // swap
+
+	// now, a<b<c, fa>fb and fb<fc, move on to Brent's algorithm
+	e = d = 0.0;
+	w = v = b; fv = fw = fb;
+	for (iter = 0; iter != max_iter; ++iter) {
+		mid = 0.5 * (a + c);
+		tol2 = 2.0 * (tol1 = tol * fabs(b) + tiny); // tol acts as a tolerance relative to |b|
+		if (fabs(b - mid) <= (tol2 - 0.5 * (c - a))) {
+			*xmin = b; return fb; // found
+		}
+		if (fabs(e) > tol1) {
+			// related to parabolic interpolation
+			r = (b - w) * (fb - fv);
+			q = (b - v) * (fb - fw);
+			p = (b - v) * q - (b - w) * r;
+			q = 2.0 * (q - r);
+			if (q > 0.0) p = 0.0 - p;
+			else q = 0.0 - q;
+			eold = e; e = d;
+			if (fabs(p) >= fabs(0.5 * q * eold) || p <= q * (a - b) || p >= q * (c - b)) {
+				d = gold2 * (e = (b >= mid ? a - b : c - b)); // parabolic step rejected: golden section instead
+			} else {
+				d = p / q; u = b + d; // actual parabolic interpolation happens here
+				if (u - a < tol2 || c - u < tol2)
+					d = (mid > b)? tol1 : 0.0 - tol1;
+			}
+		} else d = gold2 * (e = (b >= mid ? a - b : c - b)); // golden section interpolation
+		u = fabs(d) >= tol1 ? b + d : b + (d > 0.0? tol1 : -tol1); // always move by at least tol1
+		fu = func(u, data);
+		if (fu <= fb) { // u is the minimum point so far
+			if (u >= b) a = b;
+			else c = b;
+			v = w; w = b; b = u; fv = fw; fw = fb; fb = fu;
+		} else { // adjust (a,c) and (u,v,w)
+			if (u < b) a = u;
+			else c = u;
+			if (fu <= fw || w == b) {
+				v = w; w = u;
+				fv = fw; fw = fu;
+			} else if (fu <= fv || v == b || v == w) {
+				v = u; fv = fu;
+			}
+		}
+	}
+	*xmin = b; // max_iter reached without meeting the tolerance: report the best point so far
+	return fb;
+}
+
+/*************************
+ *** Special functions ***
+ *************************/
+
+/* Natural logarithm of the gamma function,
+ * \log{\Gamma(z)}
+ * AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245
+ */
+double kf_lgamma(double z)
+{
+	static const double coef[8] = { /* signed numerators of the terms over z+7, z+6, ..., z+1, z */
+		0.1659470187408462e-06, 0.9934937113930748e-05, -0.1385710331296526, 12.50734324009056,
+		-176.6150291498386, 771.3234287757674, -1259.139216722289, 676.5203681218835
+	};
+	double series = 0;
+	int i;
+	for (i = 0; i != 7; ++i) /* terms are added from z+7 down to z+1 */
+		series += coef[i] / (z + (7 - i));
+	series += coef[7] / z;
+	series += 0.9999999999995183;
+	return log(series) - 5.58106146679532777 - z + (z-0.5) * log(z+6.5);
+}
+
+/* complementary error function
+ * \frac{2}{\sqrt{\pi}} \int_x^{\infty} e^{-t^2} dt
+ * AS66, 2nd algorithm, http://lib.stat.cmu.edu/apstat/66
+ */
+double kf_erfc(double x)
+{
+	const double p0 = 220.2068679123761; // p0..p6: numerator coefficients of the rational function used for small z
+	const double p1 = 221.2135961699311;
+	const double p2 = 112.0792914978709;
+	const double p3 = 33.912866078383;
+	const double p4 = 6.37396220353165;
+	const double p5 = .7003830644436881;
+	const double p6 = .03526249659989109;
+	const double q0 = 440.4137358247522; // q0..q7: denominator coefficients of the same rational function
+	const double q1 = 793.8265125199484;
+	const double q2 = 637.3336333788311;
+	const double q3 = 296.5642487796737;
+	const double q4 = 86.78073220294608;
+	const double q5 = 16.06417757920695;
+	const double q6 = 1.755667163182642;
+	const double q7 = .08838834764831844;
+	double expntl, z, p;
+	z = fabs(x) * M_SQRT2; // work with z = |x| * sqrt(2)
+	if (z > 37.) return x > 0.? 0. : 2.; // beyond this cutoff the limit value is returned directly
+	expntl = exp(z * z * - .5);
+	if (z < 10. / M_SQRT2) // for small z
+	    p = expntl * ((((((p6 * z + p5) * z + p4) * z + p3) * z + p2) * z + p1) * z + p0)
+			/ (((((((q7 * z + q6) * z + q5) * z + q4) * z + q3) * z + q2) * z + q1) * z + q0);
+	else p = expntl / 2.506628274631001 / (z + 1. / (z + 2. / (z + 3. / (z + 4. / (z + .65))))); // larger z: nested fraction
+	return x > 0.? 2. * p : 2. * (1. - p); // p was computed from |x|; non-positive x gets 2 - 2p
+}
+
+/* The following computes regularized incomplete gamma functions.
+ * Formulas are taken from Wiki, with additional input from Numerical
+ * Recipes in C (for modified Lentz's algorithm) and AS245
+ * (http://lib.stat.cmu.edu/apstat/245).
+ *
+ * A good online calculator is available at:
+ *
+ *   http://www.danielsoper.com/statcalc/calc23.aspx
+ *
+ * It calculates upper incomplete gamma function, which equals
+ * kf_gammaq(s,z)*tgamma(s).
+ */
+
+#define KF_GAMMA_EPS 1e-14
+#define KF_TINY 1e-290
+
+// regularized lower incomplete gamma function, summed as a series (up to 99 terms after the leading 1)
+static double _kf_gammap(double s, double z)
+{
+	double term = 1., acc = 1.;
+	int k;
+	for (k = 1; k < 100; ++k) {
+		term *= z / (s + k);
+		if (term / (acc += term) < KF_GAMMA_EPS) break; // the newest term no longer changes the sum noticeably
+	}
+	return exp(s * log(z) - z - kf_lgamma(s + 1.) + log(acc));
+}
+// regularized upper incomplete gamma function, by continued fraction
+static double _kf_gammaq(double s, double z)
+{
+	int j;
+	double C, D, f; // f accumulates the continued fraction; C and D are the running ratios of the Lentz recurrence
+	f = 1. + z - s; C = f; D = 0.;
+	// Modified Lentz's algorithm for computing continued fraction
+	// See Numerical Recipes in C, 2nd edition, section 5.2
+	for (j = 1; j < 100; ++j) { // at most 99 iterations
+		double a = j * (s - j), b = (j<<1) + 1 + z - s, d;
+		D = b + a * D;
+		if (D < KF_TINY) D = KF_TINY; // keep D from becoming smaller than KF_TINY before it is inverted
+		C = b + a / C;
+		if (C < KF_TINY) C = KF_TINY;
+		D = 1. / D;
+		d = C * D;
+		f *= d;
+		if (fabs(d - 1.) < KF_GAMMA_EPS) break; // the last factor is close enough to 1: converged
+	}
+	return exp(s * log(z) - z - kf_lgamma(s) - log(f));
+}
+
+double kf_gammap(double s, double z)
+{ // lower function: the series when z <= 1 or z < s, otherwise one minus the continued-fraction result
+	return !(z <= 1. || z < s)? 1. - _kf_gammaq(s, z) : _kf_gammap(s, z);
+}
+
+double kf_gammaq(double s, double z)
+{ // upper function: one minus the series when z <= 1 or z < s, otherwise the continued-fraction result
+	return !(z <= 1. || z < s)? _kf_gammaq(s, z) : 1. - _kf_gammap(s, z);
+}
+
+/* Regularized incomplete beta function. The method is taken from
+ * Numerical Recipes in C, 2nd edition, section 6.4. The following web
+ * page calculates the incomplete beta function, which equals
+ * kf_betai(a,b,x) * gamma(a) * gamma(b) / gamma(a+b):
+ *
+ *   http://www.danielsoper.com/statcalc/calc36.aspx
+ */
+static double kf_betai_aux(double a, double b, double x)
+{
+	double C, D, f; // f accumulates the continued fraction; C and D are the running ratios of the Lentz recurrence
+	int j;
+	if (x == 0.) return 0.; // exact end points are returned directly
+	if (x == 1.) return 1.;
+	f = 1.; C = f; D = 0.;
+	// Modified Lentz's algorithm for computing continued fraction
+	for (j = 1; j < 200; ++j) { // at most 199 iterations
+		double aa, d;
+		int m = j>>1;
+		aa = (j&1)? -(a + m) * (a + b + m) * x / ((a + 2*m) * (a + 2*m + 1))
+			: m * (b - m) * x / ((a + 2*m - 1) * (a + 2*m)); // odd and even j use different coefficient formulas
+		D = 1. + aa * D;
+		if (D < KF_TINY) D = KF_TINY;
+		C = 1. + aa / C;
+		if (C < KF_TINY) C = KF_TINY;
+		D = 1. / D;
+		d = C * D;
+		f *= d;
+		if (fabs(d - 1.) < KF_GAMMA_EPS) break; // the last factor is close enough to 1: converged
+	}
+	return exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b) + a * log(x) + b * log(1.-x)) / a / f;
+}
+double kf_betai(double a, double b, double x)
+{ // below (a+1)/(a+b+2) the helper is used directly, otherwise through 1 - helper(b, a, 1-x)
+	return !(x < (a + 1.) / (a + b + 2.))? 1. - kf_betai_aux(b, a, 1. - x) : kf_betai_aux(a, b, x);
+}
+
+/******************
+ *** Statistics ***
+ ******************/
+
+double km_ks_dist(int na, const double a[], int nb, const double b[]) // a[] and b[] MUST BE sorted
+{ // merges the two samples in order and returns the largest |Fa - Fb|, Fa/Fb being the fractions of a[]/b[] consumed
+	double step_a = 1. / na, step_b = 1. / nb, cdf_a = 0, cdf_b = 0, gap, best = 0;
+	int i = 0, j = 0;
+	while (i < na || j < nb) {
+		const int take_a = i != na && (j == nb || !(a[i] > b[j])); // advance a[] unless b[] holds the smaller value
+		const int take_b = j != nb && (i == na || !(a[i] < b[j])); // values that are neither < nor > advance both
+		if (take_a) cdf_a += step_a, ++i;
+		if (take_b) cdf_b += step_b, ++j;
+		gap = fabs(cdf_a - cdf_b);
+		if (best < gap) best = gap;
+	}
+	return best;
+}
+
+#ifdef KF_MAIN
+#include <stdio.h>
+#include "ksort.h"
+KSORT_INIT_GENERIC(double)
+int main(int argc, char *argv[])
+{ // demo driver (built only with KF_MAIN): prints a K-S distance and compares kf_* results with libm where available
+	double x = 5.5, y = 3;
+	double a, b;
+	double xx[] = {0.22, -0.87, -2.39, -1.79, 0.37, -1.54, 1.28, -0.31, -0.74, 1.72, 0.38, -0.17, -0.62, -1.10, 0.30, 0.15, 2.30, 0.19, -0.50, -0.09};
+	double yy[] = {-5.13, -2.19, -2.43, -3.83, 0.50, -3.25, 4.32, 1.63, 5.18, -0.43, 7.11, 4.87, -3.10, -5.81, 3.76, 6.31, 2.58, 0.07, 5.76, 3.50};
+	ks_introsort(double, 20, xx); ks_introsort(double, 20, yy); // km_ks_dist() requires sorted input
+	printf("K-S distance: %f\n", km_ks_dist(20, xx, 20, yy));
+	printf("erfc(%lg): %lg, %lg\n", x, erfc(x), kf_erfc(x)); // libm value next to the kf_erfc() value
+	printf("upper-gamma(%lg,%lg): %lg\n", x, y, kf_gammaq(y, x)*tgamma(y)); // un-regularized by multiplying with tgamma(y)
+	a = 2; b = 2; x = 0.5;
+	printf("incomplete-beta(%lg,%lg,%lg): %lg\n", a, b, x, kf_betai(a, b, x) / exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b)));
+	return 0;
+}
+#endif
diff --git a/web/server/h2o/libh2o/deps/klib/kmath.h b/web/server/h2o/libh2o/deps/klib/kmath.h
new file mode 100644
index 000000000..2c3e77969
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/kmath.h
@@ -0,0 +1,53 @@
+#ifndef AC_KMATH_H
+#define AC_KMATH_H
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	/**********************************
+	 * Pseudo-random number generator *
+	 **********************************/
+
+	typedef uint64_t krint64_t;
+
+	struct _krand_t;
+	typedef struct _krand_t krand_t;
+
+	#define kr_drand(_kr) ((kr_rand(_kr) >> 11) * (1.0/9007199254740992.0)) /* top 53 bits of kr_rand() times 2^-53: a double in [0,1) */
+	#define kr_sample(_kr, _k, _cnt) ((*(_cnt))++ < (_k)? *(_cnt) - 1 : kr_rand(_kr) % *(_cnt)) /* increments *_cnt; yields the old count while it is below _k, else a random value below the new count (presumably a reservoir-sampling slot -- TODO confirm) */
+
+	krand_t *kr_srand(krint64_t seed);
+	krint64_t kr_rand(krand_t *kr);
+
+	/**************************
+	 * Non-linear programming *
+	 **************************/
+
+	#define KMIN_RADIUS  0.5
+	#define KMIN_EPS     1e-7
+	#define KMIN_MAXCALL 50000
+
+	typedef double (*kmin_f)(int, double*, void*);
+	typedef double (*kmin1_f)(double, void*);
+
+	double kmin_hj(kmin_f func, int n, double *x, void *data, double r, double eps, int max_calls); // Hooke-Jeeves direct search over n variables
+	double kmin_brent(kmin1_f func, double a, double b, void *data, double tol, double *xmin); // Brent's 1-dimensional minimization
+
+	/*********************
+	 * Special functions *
+	 *********************/
+
+	double kf_lgamma(double z); // log gamma function
+	double kf_erfc(double x); // complementary error function
+	double kf_gammap(double s, double z); // regularized lower incomplete gamma function
+	double kf_gammaq(double s, double z); // regularized upper incomplete gamma function
+	double kf_betai(double a, double b, double x); // regularized incomplete beta function
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/web/server/h2o/libh2o/deps/klib/knetfile.c b/web/server/h2o/libh2o/deps/klib/knetfile.c
new file mode 100644
index 000000000..158add911
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/knetfile.c
@@ -0,0 +1,628 @@
+/* The MIT License
+
+   Copyright (c) 2008 by Genome Research Ltd (GRL).
+                 2010 by Attractive Chaos <attractor@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Probably I will not do socket programming in the next few years and
+   therefore I decide to heavily annotate this file, for Linux and
+   Windows as well.  -ac */
+
+#include <time.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/types.h>
+
+#ifndef _WIN32
+#include <netdb.h>
+#include <arpa/inet.h>
+#include <sys/socket.h>
+#endif
+
+#include "knetfile.h"
+
+/* In winsock.h, the type of a socket is SOCKET, which is: "typedef
+ * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed
+ * integer -1. In knetfile.c, I use "int" for socket type
+ * throughout. This should be improved to avoid confusion.
+ *
+ * In Linux/Mac, recv() and read() do almost the same thing. You can see
+ * in the header file that netread() is simply an alias of read(). In
+ * Windows, however, they are different and using recv() is mandatory.
+ */
+
+/* Block for up to five seconds until fd becomes readable (is_read != 0)
+ * or writable (is_read == 0); the result of select() is handed back. */
+static int socket_wait(int fd, int is_read)
+{
+	fd_set watched;
+	struct timeval timeout;
+	int n_ready;
+	FD_ZERO(&watched);
+	FD_SET(fd, &watched);
+	timeout.tv_sec = 5; timeout.tv_usec = 0; // 5 seconds time out
+	// the one set is passed in either the read slot or the write slot
+	n_ready = is_read? select(fd+1, &watched, 0, 0, &timeout)
+	                 : select(fd+1, 0, &watched, 0, &timeout);
+#ifdef _WIN32
+	if (n_ready == 0)
+		fprintf(stderr, "select time-out\n");
+	else if (n_ready == SOCKET_ERROR)
+		fprintf(stderr, "select: %d\n", WSAGetLastError());
+#else
+	if (n_ready == -1) perror("select");
+#endif
+	return n_ready;
+}
+
+#ifndef _WIN32
+/* Resolve host:port with getaddrinfo() and return a connected TCP socket, or
+ * -1 after printing the reason to stderr. Not used on Windows. Adapted from an
+ * example in "Beej's Guide to Network Programming" (http://beej.us/guide/bgnet/). */
+static int socket_connect(const char *host, const char *port)
+{
+#define __err_connect(func) do { perror(func); if (fd != -1) close(fd); freeaddrinfo(res); return -1; } while (0)
+
+	int on = 1, fd = -1, gai;
+	struct linger lng = { 0, 0 };
+	struct addrinfo hints, *res = 0;
+	memset(&hints, 0, sizeof(struct addrinfo));
+	hints.ai_family = AF_UNSPEC;
+	hints.ai_socktype = SOCK_STREAM;
+	/* getaddrinfo() reports failure through its return value rather than errno and
+	 * leaves nothing to free, so it is handled apart from __err_connect. */
+	if ((gai = getaddrinfo(host, port, &hints, &res)) != 0) { fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(gai)); return -1; }
+	if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket");
+	/* The following two setsockopt() are used by ftplib
+	 * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they
+	 * necessary. From here on __err_connect also closes fd. */
+	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt");
+	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt");
+	if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect");
+	freeaddrinfo(res);
+	return fd;
+}
+#else
+/* MinGW's printf has problem with "%lld": write x in decimal ('-' first if negative) into buf, which needs room for 21 bytes */
+char *int64tostr(char *buf, int64_t x)
+{
+	int cnt, i = 0, neg = x < 0;
+	uint64_t u = neg? 0 - (uint64_t)x : (uint64_t)x; // magnitude; unsigned negation is also correct for INT64_MIN
+	do { // digits come out least-significant first
+		buf[i++] = (char)('0' + u % 10);
+		u /= 10;
+	} while (u);
+	if (neg) buf[i++] = '-'; // appended last because the string is reversed below
+	for (buf[cnt = i] = 0, i = 0; i < cnt/2; ++i) {
+		char c = buf[i]; buf[i] = buf[cnt-i-1]; buf[cnt-i-1] = c;
+	}
+	return buf;
+}
+
+int64_t strtoint64(const char *buf) // value of the leading run of decimal digits in buf; 0 if buf does not start with a digit
+{
+	int64_t x;
+	for (x = 0; *buf >= '0' && *buf <= '9'; ++buf) // stop at the first non-digit (e.g. a trailing '\r') instead of folding it into the value
+		x = x * 10 + (*buf - '0');
+	return x;
+}
+/* Windows only: calls WSAStartup() requesting Winsock version 2.2 and returns its result. */
+int knet_win32_init()
+{
+	WSADATA wsaData;
+	return WSAStartup(MAKEWORD(2, 2), &wsaData);
+}
+void knet_win32_destroy() // Windows only: calls WSACleanup(); its return value is discarded
+{
+	WSACleanup();
+}
+/* Winsock variant of socket_connect(): IPv4 only, resolved through
+ * gethostbyname()/gethostbyaddr(). A slightly modified version also works on
+ * Mac (and presumably Linux), but it was not stable there, so non-Windows
+ * builds do not use this one. Returns the socket, or -1 after logging. */
+static SOCKET socket_connect(const char *host, const char *port)
+{
+#define __err_connect(func)										\
+	do {														\
+		fprintf(stderr, "%s: %d\n", func, WSAGetLastError());	\
+		if (fd != INVALID_SOCKET) closesocket(fd);				\
+		return -1;												\
+	} while (0)
+
+	int on = 1;
+	SOCKET fd;
+	struct linger lng = { 0, 0 };
+	struct sockaddr_in server;
+	struct hostent *hp = 0;
+	if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket"); // open socket
+	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt");
+	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt");
+	// get host info
+	if (isalpha((unsigned char)host[0])) hp = gethostbyname(host);
+	else {
+		struct in_addr addr;
+		addr.s_addr = inet_addr(host);
+		hp = gethostbyaddr((char*)&addr, 4, AF_INET);
+	}
+	if (hp == 0) __err_connect("gethost");
+	memset(&server, 0, sizeof(server)); // connect; start from a zeroed address so no field is left indeterminate
+	memcpy(&server.sin_addr, hp->h_addr, sizeof(server.sin_addr)); // byte copy: no dependence on sizeof(unsigned long)
+	server.sin_family= AF_INET;
+	server.sin_port = htons(atoi(port));
+	if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect");
+	// freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!)
+	return fd;
+}
+#endif
+
+static off_t my_netread(int fd, void *buf, off_t len) // read up to len bytes from socket fd into buf; returns the number of bytes stored
+{
+	off_t rest = len, curr, l = 0;
+	/* recv() and read() may not read the required length of data with
+	 * one call. They have to be called repeatedly. */
+	while (rest) {
+		if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading
+		curr = netread(fd, (char*)buf + l, rest); // char* cast: arithmetic on void* is a compiler extension
+		/* A zero return means end-of-file (glibc manual, section 13.2) and a
+		 * negative one an error. Either way stop and report what has been
+		 * read so far: adding a negative count to l and rest would corrupt
+		 * both counters. */
+		if (curr <= 0) break;
+		l += curr; rest -= curr;
+	}
+	return l;
+}
+
+/*************************
+ * FTP specific routines *
+ *************************/
+
+static int kftp_get_response(knetFile *ftp) // read one complete reply from the control socket into ftp->response; returns its numeric code, 0 on time-out, -1 on failure
+{
+#ifndef _WIN32
+	unsigned char c;
+#else
+	char c;
+#endif
+	int n = 0;
+	char *p;
+	if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0;
+	while (netread(ftp->ctrl_fd, &c, 1) > 0) { // FIXME: this is *VERY BAD* for unbuffered I/O; "> 0" also stops on a read error (-1)
+		if (n >= ftp->max_response) { // buffer full: double it (256 bytes the first time)
+			int cap = ftp->max_response? ftp->max_response<<1 : 256;
+			if ((p = (char*)realloc(ftp->response, cap)) == 0) return -1; // the old buffer stays owned by ftp
+			ftp->response = p; ftp->max_response = cap;
+		}
+		ftp->response[n++] = c;
+		if (c == '\n') { // a line is complete; only "ddd" followed by something other than '-' ends the reply
+			if (n >= 4 && isdigit((unsigned char)ftp->response[0]) && isdigit((unsigned char)ftp->response[1]) && isdigit((unsigned char)ftp->response[2])
+				&& ftp->response[3] != '-') break;
+			n = 0;
+			continue;
+		}
+	}
+	if (n < 2) return -1;
+	ftp->response[n-2] = 0; // overwrite the byte before the final '\n' (the '\r' of "\r\n")
+	return strtol(ftp->response, &p, 0);
+}
+
+static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get) // send cmd on the control socket; with is_get != 0 also wait for and return the reply code
+{
+	if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing
+	if (netwrite(ftp->ctrl_fd, cmd, strlen(cmd)) < 0) return -1; // the command was not sent, so do not wait for a reply
+	return is_get? kftp_get_response(ftp) : 0;
+}
+
+static int kftp_pasv_prep(knetFile *ftp) // send PASV and store the advertised address in ftp->pasv_ip / ftp->pasv_port; 0 on success, -1 otherwise
+{
+	char *p;
+	int v[6];
+	kftp_send_cmd(ftp, "PASV\r\n", 1);
+	p = ftp->response? strchr(ftp->response, '(') : 0; // response is still NULL if no reply was ever stored
+	if (p == 0) return -1;
+	++p;
+	if (sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]) != 6) return -1; // a short match would leave v[] partly uninitialised
+	memcpy(ftp->pasv_ip, v, 4 * sizeof(int));
+	ftp->pasv_port = (v[4]<<8&0xff00) + v[5]; // high byte, low byte
+	return 0;
+}
+
+
+static int kftp_pasv_connect(knetFile *ftp) // open the data connection to the address stored in ftp->pasv_ip / ftp->pasv_port; 0 on success, -1 otherwise
+{
+	char host[80], port[16]; // "%d" can need 11 chars + NUL: pasv_port is built from server-supplied numbers that are not range-checked
+	if (ftp->pasv_port == 0) {
+		fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n");
+		return -1;
+	}
+	sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]);
+	sprintf(port, "%d", ftp->pasv_port);
+	ftp->fd = socket_connect(host, port);
+	if (ftp->fd == -1) return -1;
+	return 0;
+}
+
+int kftp_connect(knetFile *ftp) // open the control connection to ftp->host:ftp->port and send USER/PASS/TYPE; -1 only if the connection fails
+{
+	static const char *const login[] = { "USER anonymous\r\n", "PASS kftp@\r\n", "TYPE I\r\n" };
+	int step;
+	if ((ftp->ctrl_fd = socket_connect(ftp->host, ftp->port)) == -1) return -1;
+	kftp_get_response(ftp); // read the reply the server sends on connect
+	for (step = 0; step < 3; ++step) // each reply is read but its code is not inspected
+		kftp_send_cmd(ftp, login[step], 1);
+	return 0;
+}
+
+int kftp_reconnect(knetFile *ftp) // close both connections that are open and log in again; returns what kftp_connect() returns
+{
+	if (ftp->ctrl_fd != -1) {
+		netclose(ftp->ctrl_fd);
+		ftp->ctrl_fd = -1;
+	}
+	if (ftp->fd != -1) netclose(ftp->fd); // -1 means there is no data connection to close
+	ftp->fd = -1;
+	return kftp_connect(ftp);
+}
+
+// build a knetFile from "ftp://host/path": sets ->type, ->host, ->port, ->retr and ->size_cmd; returns 0 if fn has another form
+knetFile *kftp_parse_url(const char *fn, const char *mode)
+{
+	knetFile *fp;
+	char *p;
+	int l;
+	if (strstr(fn, "ftp://") != fn) return 0;
+	for (p = (char*)fn + 6; *p && *p != '/'; ++p);
+	if (*p != '/') return 0;
+	l = p - fn - 6; // length of the host part
+	fp = (knetFile*)calloc(1, sizeof(knetFile));
+	fp->type = KNF_TYPE_FTP;
+	fp->fd = fp->ctrl_fd = -1; // calloc() leaves 0 in ctrl_fd, which knet_close() would close as if it were an open descriptor
+	/* the Linux/Mac version of socket_connect() also recognizes a port
+	 * like "ftp", but the Windows version does not. */
+	fp->port = strdup("21");
+	fp->host = (char*)calloc(l + 1, 1);
+	if (strchr(mode, 'c')) fp->no_reconnect = 1;
+	strncpy(fp->host, fn + 6, l);
+	fp->retr = (char*)calloc(strlen(p) + 8, 1); // "RETR " + path + "\r\n" + NUL
+	sprintf(fp->retr, "RETR %s\r\n", p);
+	fp->size_cmd = (char*)calloc(strlen(p) + 8, 1); // "SIZE " + path + "\r\n" + NUL
+	sprintf(fp->size_cmd, "SIZE %s\r\n", p);
+	fp->seek_offset = 0;
+	return fp;
+}
+// (re)start the transfer: PASV, SIZE, optional REST, RETR, then open ->fd so that it delivers data from ->offset; 0 on success, -1 on failure
+int kftp_connect_file(knetFile *fp)
+{
+	int ret;
+	long long file_size;
+	if (fp->fd != -1) {
+		netclose(fp->fd); fp->fd = -1; // forget it, so an early return below cannot lead to a second close
+		if (fp->no_reconnect) kftp_get_response(fp);
+	}
+	if (kftp_pasv_prep(fp) != 0) return -1; // without a PASV address there is nothing to connect to
+	kftp_send_cmd(fp, fp->size_cmd, 1);
+#ifndef _WIN32
+	if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 )
+	{
+		fprintf(stderr,"[kftp_connect_file] %s\n", fp->response);
+		return -1;
+	}
+#else
+	const char *p = fp->response;
+	while (*p && *p != ' ') ++p; // skip the reply code; never run past the terminator
+	while (*p && (*p < '0' || *p > '9')) ++p;
+	file_size = strtoint64(p);
+#endif
+	fp->file_size = file_size;
+	if (fp->offset>=0) {
+		char tmp[32];
+#ifndef _WIN32
+		sprintf(tmp, "REST %lld\r\n", (long long)fp->offset);
+#else
+		strcpy(tmp, "REST ");
+		int64tostr(tmp + 5, fp->offset);
+		strcat(tmp, "\r\n");
+#endif
+		kftp_send_cmd(fp, tmp, 1);
+	}
+	kftp_send_cmd(fp, fp->retr, 0);
+	kftp_pasv_connect(fp);
+	ret = kftp_get_response(fp);
+	if (ret != 150) {
+		fprintf(stderr, "[kftp_connect_file] %s\n", fp->response);
+		if (fp->fd != -1) netclose(fp->fd); // fd is -1 when kftp_pasv_connect() failed
+		fp->fd = -1;
+		return -1;
+	}
+	fp->is_ready = 1;
+	return 0;
+}
+
+
+/**************************
+ * HTTP specific routines *
+ **************************/
+
+knetFile *khttp_parse_url(const char *fn, const char *mode) // build a knetFile for an "http://" URL, going through $http_proxy when it is set; 0 if fn has another scheme
+{
+	knetFile *fp;
+	char *slash, *proxy, *colon;
+	int host_len;
+	if (strncmp(fn, "http://", 7) != 0) return 0;
+	// copy everything between "http://" and the first '/' into ->http_host
+	for (slash = (char*)fn + 7; *slash && *slash != '/'; ++slash);
+	host_len = slash - fn - 7;
+	fp = (knetFile*)calloc(1, sizeof(knetFile));
+	fp->http_host = (char*)calloc(host_len + 1, 1);
+	memcpy(fp->http_host, fn + 7, host_len);
+	fp->http_host[host_len] = 0;
+	for (colon = fp->http_host; *colon && *colon != ':'; ++colon);
+	if (*colon == ':') *colon++ = 0; // cut "host:port" in two; colon now points at the port text
+	// get http_proxy
+	proxy = getenv("http_proxy");
+	// set ->host, ->port and ->path
+	if (proxy != 0) { // connect to the proxy and request the full URL
+		fp->host = strdup(strstr(proxy, "http://") == proxy? proxy + 7 : proxy);
+		for (colon = fp->host; *colon && *colon != ':'; ++colon);
+		if (*colon == ':') *colon++ = 0;
+		fp->port = strdup(*colon? colon : "80");
+		fp->path = strdup(fn);
+	} else { // when there is no proxy, server name is identical to http_host name.
+		fp->host = strdup(fp->http_host);
+		fp->port = strdup(*colon? colon : "80");
+		fp->path = strdup(*slash? slash : "/");
+	}
+	fp->type = KNF_TYPE_HTTP;
+	fp->ctrl_fd = fp->fd = -1;
+	fp->seek_offset = 0;
+	return fp;
+}
+
+int khttp_connect_file(knetFile *fp) // (re)connect fp->fd and send a ranged GET so that it delivers data from fp->offset; 0 on success, -1 on failure
+{
+	int ret, l = 0;
+	char *buf, *p;
+	if (fp->fd != -1) netclose(fp->fd);
+	if ((fp->fd = socket_connect(fp->host, fp->port)) == -1) return -1; // reading from -1 would never report end-of-file
+	buf = (char*)calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough.
+	if (buf == 0) goto fail;
+	/* the request is formatted with a bound, so an over-long path cannot overrun buf */
+	l = snprintf(buf, 0x10000, "GET %s HTTP/1.0\r\nHost: %s\r\nRange: bytes=%lld-\r\n\r\n", fp->path, fp->http_host, (long long)fp->offset);
+	if (l < 0 || l >= 0x10000) goto fail;
+	if (netwrite(fp->fd, buf, l) != l) goto fail;
+	l = 0;
+	while (l < 0xffff && netread(fp->fd, buf + l, 1) > 0) { // read HTTP header; FIXME: bad efficiency. Stops on error and before buf is full
+		if (buf[l] == '\n' && l >= 3)
+			if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
+		++l;
+	}
+	buf[l] = 0;
+	if (l < 14) goto fail; // prematured header
+	ret = strtol(buf + 8, &p, 0); // HTTP return code
+	if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file
+		off_t rest = fp->offset;
+		while (rest) {
+			off_t got = my_netread(fp->fd, buf, rest < 0x10000? rest : 0x10000);
+			if (got <= 0) goto fail; // stream ended before the offset: do not spin forever
+			rest -= got;
+		}
+	} else if (ret != 206 && ret != 200) {
+		fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret);
+		goto fail;
+	}
+	free(buf);
+	fp->is_ready = 1;
+	return 0;
+fail: // common failure exit: release the buffer and drop the connection
+	free(buf);
+	netclose(fp->fd);
+	fp->fd = -1;
+	return -1;
+}
+
+/********************
+ * Generic routines *
+ ********************/
+
+knetFile *knet_open(const char *fn, const char *mode) // open a local path, an ftp:// URL or an http:// URL for reading; 0 on failure
+{
+	knetFile *fp = 0;
+	if (mode[0] != 'r') {
+		fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n");
+		return 0;
+	}
+	if (strncmp(fn, "ftp://", 6) == 0) {
+		if ((fp = kftp_parse_url(fn, mode)) == 0) return 0;
+		if (kftp_connect(fp) != -1) kftp_connect_file(fp);
+		else {
+			knet_close(fp);
+			return 0;
+		}
+		// a failed kftp_connect_file() is caught by the ->fd test at the end
+	} else if (strncmp(fn, "http://", 7) == 0) {
+		if ((fp = khttp_parse_url(fn, mode)) == 0) return 0;
+		khttp_connect_file(fp);
+		// a failed khttp_connect_file() is caught by the ->fd test at the end
+	} else { // local file
+#ifndef _WIN32
+		int fd = open(fn, O_RDONLY);
+#else
+		/* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may
+		 * be undefined on some systems, although it is defined on my
+		 * Mac and the Linux I have tested on. */
+		int fd = open(fn, O_RDONLY | O_BINARY);
+#endif
+		if (fd == -1) {
+			perror("open");
+			return 0;
+		}
+		fp = (knetFile*)calloc(1, sizeof(knetFile));
+		fp->type = KNF_TYPE_LOCAL;
+		fp->fd = fd;
+		fp->ctrl_fd = -1;
+	}
+	if (fp == 0 || fp->fd != -1) return fp;
+	// no usable data descriptor:
+	// release the half-built handle
+	knet_close(fp);
+	return 0;
+}
+
+knetFile *knet_dopen(int fd, const char *mode) // wrap an already-open local descriptor; mode is not used; 0 if out of memory
+{
+	knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile));
+	if (fp == 0) return 0;
+	fp->type = KNF_TYPE_LOCAL; fp->fd = fd; fp->ctrl_fd = -1; // ctrl_fd left at 0 would make knet_close() close descriptor 0
+	return fp;
+}
+
+off_t knet_read(knetFile *fp, void *buf, off_t len)
+{
+	off_t l = 0;
+	if (fp->fd == -1) return 0;
+	if (fp->type == KNF_TYPE_FTP) {
+		if (fp->is_ready == 0) {
+			if (!fp->no_reconnect) kftp_reconnect(fp);
+			kftp_connect_file(fp);
+		}
+	} else if (fp->type == KNF_TYPE_HTTP) {
+		if (fp->is_ready == 0)
+			khttp_connect_file(fp);
+	}
+	if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX
+		off_t rest = len, curr;
+		while (rest) {
+			do {
+				curr = read(fp->fd, buf + l, rest);
+			} while (curr < 0 && EINTR == errno);
+			if (curr < 0) return -1;
+			if (curr == 0) break;
+			l += curr; rest -= curr;
+		}
+	} else l = my_netread(fp->fd, buf, len);
+	fp->offset += l;
+	return l;
+}
+
+off_t knet_seek(knetFile *fp, int64_t off, int whence) // local files call lseek(); FTP/HTTP only record the new ->offset and clear ->is_ready
+{
+	if (whence == SEEK_SET && off == fp->offset) return 0;
+	switch (fp->type) {
+	case KNF_TYPE_LOCAL: {
+		/* Be aware that lseek() returns the offset after seeking,
+		 * while fseek() returns zero on success. */
+		off_t offset = lseek(fp->fd, off, whence);
+		// Be silent, it is OK for knet_seek to fail when the file is streamed
+		if (offset == -1) return -1;
+		fp->offset = offset;
+		return off;
+	}
+	case KNF_TYPE_FTP:
+		if (whence == SEEK_SET)
+			fp->offset = off;
+		else if (whence == SEEK_CUR)
+			fp->offset += off;
+		else if (whence == SEEK_END)
+			fp->offset = fp->file_size+off;
+		fp->is_ready = 0;
+		return off;
+	case KNF_TYPE_HTTP:
+		if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future?
+			fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n");
+			errno = ESPIPE;
+			return -1;
+		}
+		if (whence == SEEK_SET)
+			fp->offset = off;
+		else if (whence == SEEK_CUR)
+			fp->offset += off;
+		fp->is_ready = 0;
+		return off;
+	default: break; // unknown type: reported below
+	}
+	errno = EINVAL;
+	fprintf(stderr,"[knet_seek] %s\n", strerror(errno));
+	return -1;
+}
+
+int knet_close(knetFile *fp) // close the open descriptors and free fp with every string it owns; NULL is accepted; always returns 0
+{
+	if (fp == 0) return 0;
+	if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific
+	if (fp->fd != -1) {
+		/* On Linux/Mac, netclose() is an alias of close(), but on
+		 * Windows, it is an alias of closesocket(). */
+		if (fp->type == KNF_TYPE_LOCAL) close(fp->fd);
+		else netclose(fp->fd);
+	}
+	free(fp->host); free(fp->port);
+	free(fp->response); free(fp->retr); free(fp->size_cmd); // FTP specific; size_cmd is allocated next to retr in kftp_parse_url()
+	free(fp->path); free(fp->http_host); // HTTP specific
+	free(fp);
+	return 0;
+}
+
+#ifdef KNETFILE_MAIN
+int main(void) // manual smoke test; `type` selects one of five hard-coded local/FTP/HTTP scenarios
+{
+	char *buf;
+	knetFile *fp;
+	int type = 4, l; // l is only assigned in scenarios 1 and 4, the two that use it below
+#ifdef _WIN32
+	knet_win32_init();
+#endif
+	buf = calloc(0x100000, 1); // 1 MiB scratch buffer
+	if (type == 0) {
+		fp = knet_open("knetfile.c", "r"); // NOTE(review): fp is never checked for NULL in any scenario
+		knet_seek(fp, 1000, SEEK_SET);
+	} else if (type == 1) { // NCBI FTP, large file
+		fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r");
+		knet_seek(fp, 2500000000ll, SEEK_SET);
+		l = knet_read(fp, buf, 255);
+	} else if (type == 2) {
+		fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r");
+		knet_seek(fp, 1000, SEEK_SET);
+	} else if (type == 3) {
+		fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r");
+		knet_seek(fp, 1000, SEEK_SET);
+	} else if (type == 4) {
+		fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r");
+		knet_read(fp, buf, 10000);
+		knet_seek(fp, 20000, SEEK_SET);
+		knet_seek(fp, 10000, SEEK_SET);
+		l = knet_read(fp, buf+10000, 0x100000 - 10000) + 10000; // never ask for more than is left of the 1 MiB buffer
+	}
+	if (type != 4 && type != 1) {
+		knet_read(fp, buf, 255);
+		buf[255] = 0;
+		printf("%s\n", buf);
+	} else write(fileno(stdout), buf, l);
+	knet_close(fp);
+	free(buf);
+	return 0;
+}
+#endif
diff --git a/web/server/h2o/libh2o/deps/klib/knetfile.h b/web/server/h2o/libh2o/deps/klib/knetfile.h
new file mode 100644
index 000000000..0a0e66f7a
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/knetfile.h
@@ -0,0 +1,75 @@
+#ifndef KNETFILE_H
+#define KNETFILE_H
+
+#include <stdint.h>
+#include <fcntl.h>
+
+#ifndef _WIN32 // non-Windows: the net* wrappers map straight to read()/write()/close()
+#define netread(fd, ptr, len) read(fd, ptr, len)
+#define netwrite(fd, ptr, len) write(fd, ptr, len)
+#define netclose(fd) close(fd)
+#else // Windows: the net* wrappers map to recv()/send()/closesocket() with flags 0
+#include <winsock2.h>
+#define netread(fd, ptr, len) recv(fd, ptr, len, 0)
+#define netwrite(fd, ptr, len) send(fd, ptr, len, 0)
+#define netclose(fd) closesocket(fd)
+#endif
+
+// FIXME: currently I/O is unbuffered
+
+#define KNF_TYPE_LOCAL 1 // values of knetFile.type: local file descriptor
+#define KNF_TYPE_FTP   2 // ftp:// URL
+#define KNF_TYPE_HTTP  3 // http:// URL
+
+typedef struct knetFile_s {
+	int type, fd; // KNF_TYPE_* tag, and the descriptor data is read from (-1 when there is none)
+	int64_t offset; // current position: advanced by knet_read(), set by knet_seek()
+	char *host, *port; // host and port handed to socket_connect() (the proxy's, when one is used)
+
+	// the following are for FTP only (NOTE: ctrl_fd and is_ready are also set by the HTTP code in knetfile.c)
+	int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready;
+	char *response, *retr, *size_cmd; // last control reply, and the prepared "RETR <path>" / "SIZE <path>" commands
+	int64_t seek_offset; // for lazy seek
+    int64_t file_size; // parsed from the reply to size_cmd; used by knet_seek() for SEEK_END
+
+	// the following are for HTTP only
+	char *path, *http_host; // request target of the GET line, and the value sent in the Host: header
+} knetFile;
+
+#define knet_tell(fp) ((fp)->offset)
+#define knet_fileno(fp) ((fp)->fd)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _WIN32
+	int knet_win32_init();
+	void knet_win32_destroy();
+#endif
+	/* Opens a local path, an "ftp://" URL or an "http://" URL; mode must start with 'r'. Returns NULL on failure. */
+	knetFile *knet_open(const char *fn, const char *mode);
+
+	/*
+	   This only works with local files: it wraps the already-open descriptor fd.
+	 */
+	knetFile *knet_dopen(int fd, const char *mode);
+
+	/*
+	  For FTP/HTTP, if ->is_ready==0 this routine first re-establishes ->fd;
+	  it then reads up to len bytes from ->fd and advances ->offset.
+	 */
+	off_t knet_read(knetFile *fp, void *buf, off_t len);
+
+	/*
+	  For FTP/HTTP this routine only sets ->offset and ->is_ready=0; it does
+	  not communicate with the server. Local files are moved with lseek().
+	 */
+	off_t knet_seek(knetFile *fp, int64_t off, int whence);
+	int knet_close(knetFile *fp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/web/server/h2o/libh2o/deps/klib/knhx.c b/web/server/h2o/libh2o/deps/klib/knhx.c
new file mode 100644
index 000000000..8dbd3b6e3
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/knhx.c
@@ -0,0 +1,166 @@
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "knhx.h"
+
+typedef struct { // parser state shared between kn_parse() and add_node()
+	int error, n, max; // OR of KNERR_* flags; nodes in use; nodes allocated
+	knhx1_t *node; // growable node array, handed to the caller of kn_parse()
+} knaux_t;
+
+static inline char *add_node(const char *s, knaux_t *aux, int x) // append a node with x children whose label ("name[...]:distance") starts at s; returns where parsing stopped
+{
+	char *p, *nbeg, *nend = 0;
+	knhx1_t *r;
+	if (aux->n == aux->max) { // node array full: double it (8 entries the first time)
+		aux->max = aux->max? aux->max<<1 : 8;
+		aux->node = (knhx1_t*)realloc(aux->node, sizeof(knhx1_t) * aux->max);
+	}
+	r = aux->node + (aux->n++);
+	r->n = x; r->parent = -1; r->child = 0; // realloc() memory is uninitialised: a leaf must not keep a garbage child pointer
+	for (p = (char*)s, nbeg = p, r->d = -1.0; *p && *p != ',' && *p != ')'; ++p) {
+		if (*p == '[') { // bracketed comment: the name ends here, skip to the closing ']'
+			if (nend == 0) nend = p;
+			do ++p; while (*p && *p != ']');
+			if (*p == 0) {
+				aux->error |= KNERR_BRACKET;
+				break;
+			}
+		} else if (*p == ':') { // distance follows
+			if (nend == 0) nend = p;
+			r->d = strtod(p + 1, &p);
+			--p; // the loop's ++p moves back onto the first unparsed character
+		} else if (!isgraph((unsigned char)*p)) if (nend == 0) nend = p; // cast: isgraph() is undefined for negative char values
+	}
+	if (nend == 0) nend = p;
+	if (nend != nbeg) {
+		r->name = (char*)calloc(nend - nbeg + 1, 1);
+		strncpy(r->name, nbeg, nend - nbeg);
+	} else r->name = strdup("");
+	return p;
+}
+
+knhx1_t *kn_parse(const char *nhx, int *_n, int *_error) // parse a parenthesised tree string into a node array; *_n receives the node count and *_error the KNERR_* flags
+{
+	char *p;
+	int *stack, top, max;
+	knaux_t *aux;
+	knhx1_t *ret;
+
+#define __push_back(y) do {										\
+		if (top == max) {										\
+			max = max? max<<1 : 16;								\
+			stack = (int*)realloc(stack, sizeof(int) * max);	\
+		}														\
+		stack[top++] = (y);										\
+	} while (0)
+
+	stack = 0; top = max = 0;
+	p = (char*)nhx;
+	aux = (knaux_t*)calloc(1, sizeof(knaux_t));
+	while (*p) {
+		while (*p && !isgraph((unsigned char)*p)) ++p; // cast: isgraph() is undefined for negative char values
+		if (*p == 0) break;
+		if (*p == ',') ++p;
+		else if (*p == '(') {
+			__push_back(-1); // -1 marks an opening parenthesis on the stack
+			++p;
+		} else if (*p == ')') { // close a group: everything above the nearest marker becomes a child of a new node
+			int x = aux->n, m, i;
+			for (i = top - 1; i >= 0; --i)
+				if (stack[i] < 0) break;
+			m = top - 1 - i; if (i < 0) aux->error |= KNERR_MISSING_LEFT; // no marker found: ')' without '('
+			p = add_node(p + 1, aux, m);
+			aux->node[x].child = (int*)calloc(m, sizeof(int));
+			for (i = top - 1, m = m - 1; m >= 0; --m, --i) {
+				aux->node[x].child[m] = stack[i];
+				aux->node[stack[i]].parent = x;
+			}
+			top = i < 0? 0 : i; // i is -1 when the marker was missing; a negative top would make the push below write before the stack
+			__push_back(x);
+		} else { // a leaf label
+			__push_back(aux->n);
+			p = add_node(p, aux, 0);
+		}
+	}
+	*_n = aux->n;
+	*_error = aux->error;
+	ret = aux->node; // the node array outlives aux and is returned to the caller
+	free(aux); free(stack);
+	return ret;
+}
+
+#ifndef kroundup32 // rounds x up, in place, to the next power of two; the shifts cover 32 bits
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+static inline int kputsn(const char *p, int l, kstring_t *s) // append the l bytes at p to s, keeping s->s NUL-terminated; returns l
+{
+	const size_t need = s->l + l + 1; // bytes in use after the append, terminator included
+	if (need >= s->m) { // too small: grow to a power of two above need
+		s->m = need + 1; kroundup32(s->m);
+		s->s = (char*)realloc(s->s, s->m);
+	}
+	memcpy(s->s + s->l, p, l);
+	s->l += l; s->s[s->l] = 0;
+	return l;
+}
+
+static inline int kputc(int c, kstring_t *s) // append the single character c to s, keeping s->s NUL-terminated; returns c
+{
+	const size_t need = s->l + 1; // length once c has been stored
+	if (need >= s->m) { // no room for c plus the terminator
+		s->m = need + 1; kroundup32(s->m);
+		s->s = (char*)realloc(s->s, s->m);
+	}
+	s->s[s->l++] = c; s->s[s->l] = 0;
+	return c;
+}
+
+static void format_node_recur(const knhx1_t *node, const knhx1_t *p, kstring_t *s, char *numbuf) // append the subtree rooted at p to s; numbuf is scratch space for one formatted distance
+{
+	if (p->n) { // internal node: children first, comma-separated inside parentheses
+		int i;
+		kputc('(', s);
+		for (i = 0; i < p->n; ++i) {
+			if (i) kputc(',', s);
+			format_node_recur(node, &node[p->child[i]], s, numbuf);
+		}
+		kputc(')', s);
+	}
+	if (p->name) kputsn(p->name, strlen(p->name), s);
+	if (p->d >= 0) { // leaves carry a distance too; writing it only for internal nodes would drop it on output
+		sprintf(numbuf, ":%g", p->d);
+		kputsn(numbuf, strlen(numbuf), s);
+	}
+}
+
+void kn_format(const knhx1_t *node, int root, kstring_t *s) // TODO: get rid of recursion; appends the text form of the tree rooted at node[root] to s
+{
+	char numbuf[128]; // scratch buffer handed down to format_node_recur(), which writes ":%g" into it
+	format_node_recur(node, &node[root], s, numbuf);
+}
+
+#ifdef KNHX_MAIN
+int main(int argc, char *argv[]) // demo: parse a fixed tree string, dump every node, then format it back
+{
+	char *s = "((a[abc],d1)x:0.5,((b[&&NHX:S=MOUSE],h2)[&&NHX:S=HUMAN:B=99][blabla][&&NHX:K=foo],c))";
+	knhx1_t *node;
+	int i, j, n, error;
+	kstring_t str;
+	node = kn_parse(s, &n, &error);
+	for (i = 0; i < n; ++i) { // one line per node: index, name, parent, child count, distance, then the child indices
+		knhx1_t *p = node + i;
+		printf("[%d] %s\t%d\t%d\t%g", i, p->name, p->parent, p->n, p->d);
+		for (j = 0; j < p->n; ++j)
+			printf("\t%d", p->child[j]);
+		putchar('\n');
+	}
+	str.l = str.m = 0; str.s = 0;
+	kn_format(node, n-1, &str); // the last node in the array is used as the root
+	puts(str.s);
+	free(str.s); // NOTE(review): node, its names and child arrays are not freed here
+	return 0;
+}
+#endif
diff --git a/web/server/h2o/libh2o/deps/klib/knhx.h b/web/server/h2o/libh2o/deps/klib/knhx.h
new file mode 100644
index 000000000..dbad7dd94
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/knhx.h
@@ -0,0 +1,35 @@
+#ifndef KNHX_H_
+#define KNHX_H_
+
+#define KNERR_MISSING_LEFT   0x01 // bit values for the error word returned through kn_parse()'s _error argument
+#define KNERR_MISSING_RGHT   0x02
+#define KNERR_BRACKET        0x04 // set by the parser when a '[' is never closed by ']'
+#define KNERR_COLON          0x08
+
+typedef struct {
+	int parent, n; // index of the parent node (-1 when none was assigned) and number of children
+	int *child; // indices of the n children within the node array
+	char *name; // node label; the parser stores an empty string when there is none
+	double d; // number parsed after ':'; the parser stores -1.0 when there is none
+} knhx1_t;
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+	size_t l, m; // bytes in use (terminator excluded) and bytes allocated
+	char *s; // the buffer
+} kstring_t;
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	knhx1_t *kn_parse(const char *nhx, int *_n, int *_error); // returns the node array; *_n gets the node count, *_error the KNERR_* flags
+	void kn_format(const knhx1_t *node, int root, kstring_t *s); // appends the text form of the tree rooted at node[root] to s
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/web/server/h2o/libh2o/deps/klib/kopen.c b/web/server/h2o/libh2o/deps/klib/kopen.c
new file mode 100644
index 000000000..f72735c42
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/kopen.c
@@ -0,0 +1,343 @@
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <ctype.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <sys/types.h>
+#ifndef _WIN32
+#include <netdb.h>
+#include <arpa/inet.h>
+#include <sys/socket.h>
+#endif
+
+#ifdef _WIN32
+#define _KO_NO_NET
+#endif
+
+#ifndef _KO_NO_NET
+static int socket_wait(int fd, int is_read) // wait up to 5 seconds for fd to become readable (is_read != 0) or writable; returns select()'s result
+{
+	fd_set watched;
+	struct timeval timeout;
+	int n_ready;
+	FD_ZERO(&watched);
+	FD_SET(fd, &watched);
+	timeout.tv_sec = 5; timeout.tv_usec = 0; // 5 seconds time out
+	// the one set is passed in either the read slot or the write slot
+	n_ready = is_read? select(fd+1, &watched, 0, 0, &timeout)
+	                 : select(fd+1, 0, &watched, 0, &timeout);
+	if (n_ready == -1) perror("select");
+	return n_ready;
+}
+
+static int socket_connect(const char *host, const char *port) // resolve host:port and return a connected TCP socket, or -1 after printing the reason to stderr
+{
+#define __err_connect(func) do { perror(func); if (fd != -1) close(fd); freeaddrinfo(res); return -1; } while (0)
+
+	int on = 1, fd = -1, gai;
+	struct linger lng = { 0, 0 };
+	struct addrinfo hints, *res = 0;
+	memset(&hints, 0, sizeof(struct addrinfo));
+	hints.ai_family = AF_UNSPEC;
+	hints.ai_socktype = SOCK_STREAM;
+	if ((gai = getaddrinfo(host, port, &hints, &res)) != 0) { fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(gai)); return -1; } // failure comes back as a return code, not errno, and leaves nothing to free
+	if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket");
+	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt"); // from here on __err_connect also closes fd
+	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt");
+	if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect");
+	freeaddrinfo(res);
+	return fd;
+#undef __err_connect
+}
+
+static int http_open(const char *fn) // fetch fn with an HTTP/1.0 GET; returns a descriptor positioned just after the response header, or -1
+{
+	char *p, *proxy, *q, *http_host, *host, *port, *path, *buf;
+	int fd, ret, l;
+
+	/* parse URL; adapted from khttp_parse_url() in knetfile.c */
+	if (strstr(fn, "http://") != fn) return -1; // -1, not 0: 0 is a valid descriptor number
+	// set ->http_host
+	for (p = (char*)fn + 7; *p && *p != '/'; ++p);
+	l = p - fn - 7;
+	http_host = calloc(l + 1, 1);
+	strncpy(http_host, fn + 7, l);
+	http_host[l] = 0;
+	for (q = http_host; *q && *q != ':'; ++q);
+	if (*q == ':') *q++ = 0;
+	// get http_proxy
+	proxy = getenv("http_proxy");
+	// set host, port and path
+	if (proxy == 0) {
+		host = strdup(http_host); // when there is no proxy, server name is identical to http_host name.
+		port = strdup(*q? q : "80");
+		path = strdup(*p? p : "/");
+	} else {
+		host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy);
+		for (q = host; *q && *q != ':'; ++q);
+		if (*q == ':') *q++ = 0;
+		port = strdup(*q? q : "80");
+		path = strdup(fn);
+	}
+
+	/* connect; adapted from khttp_connect() in knetfile.c */
+	fd = socket_connect(host, port);
+	buf = calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough.
+	ret = 0; // stays 0 unless a status line is actually parsed
+	l = -1;
+	if (fd != -1 && buf != 0) // the request is formatted with a bound so a long path cannot overrun buf
+		l = snprintf(buf, 0x10000, "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n", path, http_host);
+	if (l > 0 && l < 0x10000 && write(fd, buf, l) == l) {
+		l = 0;
+		while (l < 0xffff && read(fd, buf + l, 1) > 0) { // read HTTP header; FIXME: bad efficiency. Stops on error and before buf is full
+			if (buf[l] == '\n' && l >= 3)
+				if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
+			++l;
+		}
+		buf[l] = 0;
+		if (l >= 14) ret = strtol(buf + 8, &p, 0); // HTTP return code; shorter input is a prematured header
+	}
+	/* Every failure (no connection, no buffer, request not sent, short header,
+	 * status other than 200) ends here, so the socket is closed exactly once. */
+	if (ret != 200 && fd != -1) {
+		close(fd);
+		fd = -1;
+	}
+	free(buf); free(http_host); free(host); free(port); free(path);
+	return fd;
+}
+
+typedef struct { // state of one FTP control connection, used by ftp_open() and its helpers
+	int max_response, ctrl_fd; // allocated size of response; control socket
+	char *response; // buffer holding the most recent reply line
+} ftpaux_t;
+
+static int kftp_get_response(ftpaux_t *aux) // read one complete reply into aux->response; returns its numeric code, 0 on time-out, -1 on failure
+{
+	unsigned char c;
+	int n = 0, cap;
+	char *p;
+	if (socket_wait(aux->ctrl_fd, 1) <= 0) return 0;
+	while (read(aux->ctrl_fd, &c, 1) > 0) { // FIXME: this is *VERY BAD* for unbuffered I/O; "> 0" also stops on a read error (-1)
+		if (n >= aux->max_response) { // buffer full: double it (256 bytes the first time); the old buffer stays valid if realloc fails
+			if ((p = realloc(aux->response, cap = aux->max_response? aux->max_response<<1 : 256)) == 0) return -1;
+			aux->response = p; aux->max_response = cap;
+		}
+		aux->response[n++] = c;
+		if (c == '\n') { // a line is complete; only "ddd" followed by something other than '-' ends the reply
+			if (n >= 4 && isdigit((unsigned char)aux->response[0]) && isdigit((unsigned char)aux->response[1]) && isdigit((unsigned char)aux->response[2])
+				&& aux->response[3] != '-') break;
+			n = 0;
+			continue;
+		}
+	}
+	if (n < 2) return -1;
+	aux->response[n-2] = 0; // overwrite the byte before the final '\n' (the '\r' of "\r\n")
+	return strtol(aux->response, &p, 0);
+}
+
+static int kftp_send_cmd(ftpaux_t *aux, const char *cmd, int is_get) // send cmd on the control socket; with is_get != 0 also wait for and return the reply code
+{
+	if (socket_wait(aux->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing
+	if (write(aux->ctrl_fd, cmd, strlen(cmd)) < 0) return -1; // the command was not sent, so do not wait for a reply
+	return is_get? kftp_get_response(aux) : 0;
+}
+
+static int ftp_open(const char *fn) // anonymous passive-mode FTP download of fn; returns the data socket, or -1 on failure
+{
+	char *p, *host = 0, *port = 0, *retr = 0;
+	char host2[80], port2[16]; // "%d" can need 11 chars + NUL: the port is built from server-supplied numbers that are not range-checked
+	int v[6], l, fd = -1, ret, pasv_port, pasv_ip[4];
+	ftpaux_t aux;
+
+	/* parse URL */
+	if (strstr(fn, "ftp://") != fn) return -1; // -1, not 0: 0 is a valid descriptor number
+	for (p = (char*)fn + 6; *p && *p != '/'; ++p);
+	if (*p != '/') return -1;
+	l = p - fn - 6;
+	port = strdup("21");
+	host = calloc(l + 1, 1);
+	strncpy(host, fn + 6, l);
+	retr = calloc(strlen(p) + 8, 1);
+	sprintf(retr, "RETR %s\r\n", p);
+
+	/* connect to ctrl */
+	memset(&aux, 0, sizeof(ftpaux_t));
+	aux.ctrl_fd = socket_connect(host, port);
+	if (aux.ctrl_fd == -1) goto ftp_open_end; /* fail to connect ctrl */
+
+	/* connect to the data stream */
+	kftp_get_response(&aux);
+	kftp_send_cmd(&aux, "USER anonymous\r\n", 1);
+	kftp_send_cmd(&aux, "PASS kopen@\r\n", 1);
+	kftp_send_cmd(&aux, "TYPE I\r\n", 1);
+	kftp_send_cmd(&aux, "PASV\r\n", 1);
+	p = aux.response? strchr(aux.response, '(') : 0; // response is still NULL if no reply was ever stored
+	if (p == 0) goto ftp_open_end;
+	++p;
+	if (sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]) != 6) goto ftp_open_end; // a short match would leave v[] partly uninitialised
+	memcpy(pasv_ip, v, 4 * sizeof(int));
+	pasv_port = (v[4]<<8&0xff00) + v[5];
+	kftp_send_cmd(&aux, retr, 0);
+	sprintf(host2, "%d.%d.%d.%d", pasv_ip[0], pasv_ip[1], pasv_ip[2], pasv_ip[3]);
+	sprintf(port2, "%d", pasv_port);
+	fd = socket_connect(host2, port2);
+	if (fd == -1) goto ftp_open_end;
+	ret = kftp_get_response(&aux);
+	if (ret != 150) {
+		close(fd);
+		fd = -1;
+	}
+
+ftp_open_end:
+	if (aux.ctrl_fd != -1) close(aux.ctrl_fd); // after the label, so the error paths above release the control socket too
+	free(host); free(port); free(retr); free(aux.response);
+	return fd;
+}
+#endif /* !defined(_KO_NO_NET) */
+
+static char **cmd2argv(const char *cmd) // split cmd at white space into a NULL-terminated argv; 0 if cmd is blank. argv[0] owns the single string buffer
+{
+	int i, beg, end, argc;
+	char **argv, *str;
+	end = strlen(cmd);
+	for (i = end - 1; i >= 0; --i) // trim trailing white space
+		if (!isspace((unsigned char)cmd[i])) break;
+	end = i + 1;
+	for (beg = 0; beg < end; ++beg) // trim leading white space
+		if (!isspace((unsigned char)cmd[beg])) break;
+	if (beg == end) return 0;
+	for (i = beg + 1, argc = 0; i < end; ++i) // count word ends inside the trimmed range: one less than the word count
+		if (isspace((unsigned char)cmd[i]) && !isspace((unsigned char)cmd[i-1]))
+			++argc;
+	argv = (char**)calloc(argc + 2, sizeof(void*)); // words + terminating NULL
+	argv[0] = str = (char*)calloc(end - beg + 1, 1);
+	strncpy(argv[0], cmd + beg, end - beg);
+	for (i = argc = 1; i < end - beg; ++i) // turn separators into NULs and record where each further word starts
+		if (isspace((unsigned char)str[i])) str[i] = 0;
+		else if (str[i] && str[i-1] == 0) argv[argc++] = &str[i];
+	return argv;
+}
+
+#define KO_STDIN    1 // values of koaux_t.type: fn was "-"
+#define KO_FILE     2 // ordinary file opened with open()
+#define KO_PIPE     3 // output of a command given as "<command"
+#define KO_HTTP     4 // http:// URL
+#define KO_FTP      5 // ftp:// URL
+
+typedef struct { // the handle kopen() returns as void*
+	int type, fd; // KO_* tag and the descriptor handed to the caller
+	pid_t pid; // child process id; only set for KO_PIPE
+} koaux_t;
+
+void *kopen(const char *fn, int *_fd) // open fn (URL, "-" for stdin, "<command" for a pipe, or a path); stores the descriptor in *_fd (-1 on failure) and returns a handle for kclose()
+{
+	koaux_t *aux = 0;
+	*_fd = -1;
+	if (strstr(fn, "http://") == fn) {
+		aux = calloc(1, sizeof(koaux_t));
+		aux->type = KO_HTTP;
+		aux->fd = http_open(fn);
+	} else if (strstr(fn, "ftp://") == fn) {
+		aux = calloc(1, sizeof(koaux_t));
+		aux->type = KO_FTP;
+		aux->fd = ftp_open(fn);
+	} else if (strcmp(fn, "-") == 0) {
+		aux = calloc(1, sizeof(koaux_t));
+		aux->type = KO_STDIN;
+		aux->fd = STDIN_FILENO;
+	} else {
+		const char *p, *q;
+		for (p = fn; *p; ++p)
+			if (!isspace((unsigned char)*p)) break;
+		if (*p == '<') { // pipe open
+			int need_shell, pfd[2];
+			pid_t pid;
+			// a simple check to see if we need to invoke a shell; not always working
+			for (q = p + 1; *q; ++q)
+				if (ispunct((unsigned char)*q) && *q != '.' && *q != '_' && *q != '-' && *q != ':')
+					break;
+			need_shell = (*q != 0);
+			if (pipe(pfd) == -1) return 0;
+			pid = fork(); // fork(), not vfork(): the child allocates memory and calls dup2() before exec
+			if (pid == -1) { /* fork() error */
+				close(pfd[0]); close(pfd[1]);
+				return 0;
+			}
+			if (pid == 0) { /* the child process */
+				char **argv;
+				close(pfd[0]);
+				dup2(pfd[1], STDOUT_FILENO); // the command's stdout feeds the pipe
+				close(pfd[1]);
+				if (!need_shell) {
+					argv = cmd2argv(p + 1);
+					if (argv) execvp(argv[0], argv); // argv is NULL for a blank command; exec only returns on failure
+					if (argv) { free(argv[0]); free(argv); }
+				} else execl("/bin/sh", "sh", "-c", p + 1, NULL);
+				_exit(1); // _exit(): do not flush stdio buffers copied from the parent
+			} else { /* parent process */
+				close(pfd[1]);
+				aux = calloc(1, sizeof(koaux_t));
+				aux->type = KO_PIPE;
+				aux->fd = pfd[0];
+				aux->pid = pid;
+			}
+		} else {
+#ifdef _WIN32
+			*_fd = open(fn, O_RDONLY | O_BINARY);
+#else
+			*_fd = open(fn, O_RDONLY);
+#endif
+			if (*_fd != -1) { // open() signals failure with -1; 0 is a valid descriptor
+				aux = calloc(1, sizeof(koaux_t));
+				aux->type = KO_FILE;
+				aux->fd = *_fd;
+			}
+		}
+	}
+	*_fd = aux? aux->fd : -1; // aux is still NULL when a local file could not be opened
+	return aux;
+}
+
+int kclose(void *a) // release a handle returned by kopen(); the descriptor itself is not closed here. Always returns 0
+{
+	koaux_t *aux = (koaux_t*)a;
+	if (aux == 0) return 0; // kopen() returns NULL on some failures
+	if (aux->type == KO_PIPE) { // NOTE(review): waitpid() is declared in <sys/wait.h>, which this file does not include
+		int status;
+		if (waitpid(aux->pid, &status, WNOHANG) != aux->pid) kill(aux->pid, 15); // child not reaped yet: send signal 15 (SIGTERM)
+	}
+	free(aux); // the handle was allocated by kopen() and nothing else releases it
+	return 0;
+}
+
+#ifdef _KO_MAIN
+#define BUF_SIZE 0x10000 // copy chunk size: 64 KiB
+int main(int argc, char *argv[]) // demo: copy whatever kopen() can open for argv[1] to stdout
+{
+	void *x;
+	int l, fd;
+	unsigned char buf[BUF_SIZE];
+	FILE *fp;
+	if (argc == 1) {
+		fprintf(stderr, "Usage: kopen <file>\n");
+		return 1;
+	}
+	x = kopen(argv[1], &fd);
+	fp = fdopen(fd, "r"); // failure of kopen() is detected here, through the descriptor it stored in fd
+	if (fp == 0) {
+		fprintf(stderr, "ERROR: fail to open the input\n");
+		return 1;
+	}
+	do { // a short read ends the loop
+		if ((l = fread(buf, 1, BUF_SIZE, fp)) != 0)
+			fwrite(buf, 1, l, stdout);
+	} while (l == BUF_SIZE);
+	fclose(fp); // this also closes fd
+	kclose(x);
+	return 0;
+}
+#endif
diff --git a/web/server/h2o/libh2o/deps/klib/ksa.c b/web/server/h2o/libh2o/deps/klib/ksa.c
new file mode 100644
index 000000000..18f686d11
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/ksa.c
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2008 Yuta Mori    All Rights Reserved.
+ *               2011 Attractive Chaos <attractor@live.co.uk>
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/* This is a library for constructing the suffix array for a string containing
+ * multiple sentinels with sentinels all represented by 0. The last symbol in
+ * the string must be a sentinel. The library is modified from an early version
+ * of Yuta Mori's SAIS library, but is slower than the latest SAIS by about
+ * 30%, partly due to the recent optimization Yuta has applied and partly due
+ * to the extra comparisons between sentinels. This is not the first effort in
+ * supporting multi-sentinel strings, but is probably the easiest to use. */
+
+#include <stdlib.h>
+
+#ifdef _KSA64 /* 64-bit build: indices are int64_t and the entry points carry a "64" suffix */
+#include <stdint.h>
+typedef int64_t saint_t;
+#define SAINT_MAX INT64_MAX
+#define SAIS_CORE ksa_core64
+#define SAIS_BWT  ksa_bwt64
+#define SAIS_MAIN ksa_sa64
+#else /* default build: indices are plain int */
+#include <limits.h>
+typedef int saint_t;
+#define SAINT_MAX INT_MAX
+#define SAIS_CORE ksa_core
+#define SAIS_BWT  ksa_bwt
+#define SAIS_MAIN ksa_sa
+#endif
+
+/* chr(i) expects T and cs in scope. If cs == sizeof(saint_t), T is read as an array of saint_t; otherwise T is a byte string and a zero byte (sentinel) at position i yields the negative value i - SAINT_MAX, so sentinels at different positions get different values */
+#define chr(i) (cs == sizeof(saint_t) ? ((const saint_t *)T)[i] : (T[i]? (saint_t)T[i] : i - SAINT_MAX))
+
+/** Tally how often each symbol occurs in T[0..n-1]; every non-positive chr() value is counted in C[0] */
+static void getCounts(const unsigned char *T, saint_t *C, saint_t n, saint_t k, int cs)
+{
+	saint_t i = 0, sym;
+	while (i < k) C[i++] = 0; /* clear all k counters */
+	for (i = 0; i < n; ++i) {
+		sym = chr(i);
+		C[sym > 0? sym : 0] += 1;
+	}
+}
+
+/**
+ * Turn symbol counts into bucket boundaries (a running prefix sum)
+ *
+ * @param C   occurrences computed by getCounts(); input
+ * @param B   start/end of each bucket; output (callers may pass the same array as C)
+ * @param k   size of alphabet
+ * @param end if true, B[i] is the end of bucket i (one past its last slot); otherwise its start
+ */
+static inline void getBuckets(const saint_t *C, saint_t *B, saint_t k, saint_t end)
+{
+	saint_t i, total = 0;
+	for (i = 0; i < k; ++i) /* C[i] is read before B[i] is written, so C == B works */
+		total += C[i], B[i] = end? total : total - C[i];
+}
+
+/** Induced sort: from the seed positions the caller left in SA, place the L-type suffixes (first pass), then the S-type suffixes (second pass) */
+static void induceSA(const unsigned char *T, saint_t *SA, saint_t *C, saint_t *B, saint_t n, saint_t k, saint_t cs)
+{
+	saint_t *b, i, j;
+	saint_t  c0, c1;
+	/* left-to-right induced sort (for L-type) */
+	if (C == B) getCounts(T, C, n, k, cs); /* C and B share storage, so the counts have to be rebuilt first */
+	getBuckets(C, B, k, 0);	/* find starts of buckets */
+	for (i = 0, b = 0, c1 = -1; i < n; ++i) { /* b is the write cursor into the bucket of symbol c1 */
+		j = SA[i], SA[i] = ~j;
+		if (0 < j) { /* >0 if j-1 is L-type; <0 if S-type; ==0 undefined */
+			--j;
+			if ((c0 = chr(j)) != c1) { /* symbol changed: save the old cursor back into B, load the new bucket's cursor */
+				B[c1 > 0? c1 : 0] = b - SA;
+				c1 = c0;
+				b = SA + B[c1 > 0? c1 : 0];
+			}
+			*b++ = (0 < j && chr(j - 1) < c1) ? ~j : j;
+		}
+	}
+	/* right-to-left induced sort (for S-type) */
+	if (C == B) getCounts(T, C, n, k, cs);
+	getBuckets(C, B, k, 1);	/* find ends of buckets */
+	for (i = n - 1, b = 0, c1 = -1; 0 <= i; --i) {
+		if (0 < (j = SA[i])) { /* the prefix is S-type */
+			--j;
+			if ((c0 = chr(j)) != c1) {
+				B[c1 > 0? c1 : 0] = b - SA;
+				c1 = c0;
+				b = SA + B[c1 > 0? c1 : 0];
+			}
+			if (c0 > 0) *--b = (j == 0 || chr(j - 1) > c1) ? ~j : j; /* positions whose symbol is not positive are not written in this pass */
+		} else SA[i] = ~j; /* if L-type, change the sign */
+	}
+}
+
+/**
+ * Recursively construct the suffix array for a string containing multiple
+ * sentinels. The NUL symbol (value 0) is taken as the sentinel.
+ *
+ * @param T   NUL terminated input string (there can be multiple NULs)
+ * @param SA  output suffix array; the code addresses up to SA[n + fs - 1]
+ * @param fs  working space available in SA (typically 0 when first called)
+ * @param n   length of T, including the trailing NUL
+ * @param k   size of the alphabet (typically 256 when first called)
+ * @param cs  # bytes per element in T; 1 or sizeof(saint_t) (typically 1 when first called)
+ *
+ * @return    0 upon success; -2 if a malloc() here or in the recursive call fails
+ */
+int SAIS_CORE(const unsigned char *T, saint_t *SA, saint_t fs, saint_t n, saint_t k, int cs)
+{
+	saint_t *C, *B;
+	saint_t  i, j, c, m, q, qlen, name;
+	saint_t  c0, c1;
+
+	/* STAGE I: reduce the problem by at least 1/2; sort all the S-substrings */
+	if (k <= fs) C = SA + n, B = (k <= fs - k) ? C + k : C; /* enough spare room behind SA[n-1]: use it instead of malloc() */
+	else {
+		if ((C = (saint_t*)malloc(k * (1 + (cs == 1)) * sizeof(saint_t))) == NULL) return -2;
+		B = cs == 1? C + k : C;
+	}
+	getCounts(T, C, n, k, cs);
+	getBuckets(C, B, k, 1);	/* find ends of buckets */
+	for (i = 0; i < n; ++i) SA[i] = 0;
+	/* mark L and S (the t array in Nong et al.), and keep the positions of LMS in the buckets */
+	for (i = n - 2, c = 1, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
+		if ((c0 = chr(i)) < c1 + c) c = 1; /* c1 = chr(i+1); c==1 if in an S run */
+		else if (c) SA[--B[c1 > 0? c1 : 0]] = i + 1, c = 0;
+	}
+	induceSA(T, SA, C, B, n, k, cs);
+	if (fs < k) free(C); /* C came from malloc() exactly when the spare room was too small */
+	/* pack all the sorted LMS into the first m items of SA;
+	   2*m must be not larger than n (see Nong et al. for the proof) */
+	for (i = 0, m = 0; i < n; ++i) {
+		saint_t p = SA[i];
+		if (p == n - 1) SA[m++] = p;
+		else if (0 < p && chr(p - 1) > (c0 = chr(p))) {
+			for (j = p + 1; j < n && c0 == (c1 = chr(j)); ++j);
+			if (j < n && c0 < c1) SA[m++] = p;
+		}
+	}
+	for (i = m; i < n; ++i) SA[i] = 0;	/* init the name array buffer */
+	/* store the length of all substrings */
+	for (i = n - 2, j = n, c = 1, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
+		if ((c0 = chr(i)) < c1 + c) c = 1; /* c1 = chr(i+1) */
+		else if (c) SA[m + ((i + 1) >> 1)] = j - i - 1, j = i + 1, c = 0;
+	}
+	/* find the lexicographic names of all substrings */
+	for (i = 0, name = 0, q = n, qlen = 0; i < m; ++i) {
+		saint_t p = SA[i], plen = SA[m + (p >> 1)], diff = 1;
+		if (plen == qlen) { /* only substrings of equal length can be identical to the previous one */
+			for (j = 0; j < plen && chr(p + j) == chr(q + j); j++);
+			if (j == plen) diff = 0;
+		}
+		if (diff) ++name, q = p, qlen = plen;
+		SA[m + (p >> 1)] = name;
+	}
+
+	/* STAGE II: solve the reduced problem; recurse if names are not yet unique */
+	if (name < m) {
+		saint_t *RA = SA + n + fs - m - 1; /* reduced string of m names plus a sentinel, stored at the tail of SA */
+		for (i = n - 1, j = m - 1; m <= i; --i)
+			if (SA[i] != 0) RA[j--] = SA[i];
+		RA[m] = 0; // add a sentinel; in the resulting SA, SA[0]==m always stands
+		if (SAIS_CORE((unsigned char *)RA, SA, fs + n - m * 2 - 2, m + 1, name + 1, sizeof(saint_t)) != 0) return -2;
+		for (i = n - 2, j = m - 1, c = 1, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
+			if ((c0 = chr(i)) < c1 + c) c = 1;
+			else if (c) RA[j--] = i + 1, c = 0; /* get p1 */
+		}
+		for (i = 0; i < m; ++i) SA[i] = RA[SA[i+1]]; /* get index  */
+	}
+
+	/* STAGE III: induce the result for the original problem */
+	if (k <= fs) C = SA + n, B = (k <= fs - k) ? C + k : C;
+	else {
+		if ((C = (saint_t*)malloc(k * (1 + (cs == 1)) * sizeof(saint_t))) == NULL) return -2;
+		B = cs == 1? C + k : C;
+	}
+	/* put all LMS characters into their buckets */
+	getCounts(T, C, n, k, cs);
+	getBuckets(C, B, k, 1);	/* find ends of buckets */
+	for (i = m; i < n; ++i) SA[i] = 0; /* init SA[m..n-1] */
+	for (i = m - 1; 0 <= i; --i) {
+		j = SA[i], SA[i] = 0;
+		c = chr(j);
+		SA[--B[c > 0? c : 0]] = j;
+	}
+	induceSA(T, SA, C, B, n, k, cs);
+	if (fs < k) free(C);
+	return 0;
+}
+
+/**
+ * Construct the suffix array for a NUL terminated string possibly containing
+ * multiple sentinels (NULs).
+ *
+ * @param T[0..n-1]  NUL terminated input string
+ * @param SA[0..n-1] output suffix array
+ * @param n          length of the given string, including the final NUL
+ * @param k          size of the alphabet including the sentinel; values outside 0..256 are replaced by 256
+ * @return           0 upon success; -1 on invalid arguments; otherwise the error code of the core routine
+ */
+int SAIS_MAIN(const unsigned char *T, saint_t *SA, saint_t n, int k)
+{
+	if (T == NULL || SA == NULL || n <= 0 || T[n - 1] != '\0') return -1; /* n is validated before T[n - 1] is read */
+	if (k < 0 || k > 256) k = 256;
+	return SAIS_CORE(T, SA, 0, n, (saint_t)k, 1);
+}
+
+int SAIS_BWT(unsigned char *T, saint_t n, int k) /* overwrite T[0..n-1] with T[SA[i]-1] for each suffix-array entry; 0 on success, non-zero on failure */
+{
+	saint_t *SA, i;
+	int ret;
+	if (n <= 0 || (SA = malloc(n * sizeof(saint_t))) == 0) return -1; /* reject an empty/negative length before sizing the buffer */
+	if ((ret = SAIS_MAIN(T, SA, n, k)) != 0) { free(SA); return ret; } /* do not leak SA when construction fails */
+	for (i = 0; i < n; ++i)
+		if (SA[i]) SA[i] = T[SA[i] - 1]; // if SA[i]==0, SA[i]=0
+	for (i = 0; i < n; ++i) T[i] = SA[i]; /* copy the transformed symbols back over the input */
+	free(SA);
+	return 0;
+}
diff --git a/web/server/h2o/libh2o/deps/klib/kseq.h b/web/server/h2o/libh2o/deps/klib/kseq.h
new file mode 100644
index 000000000..b2238d1d3
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/kseq.h
@@ -0,0 +1,235 @@
+/* The MIT License
+
+   Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Last Modified: 05MAR2012 */
+
+#ifndef AC_KSEQ_H
+#define AC_KSEQ_H
+
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define KS_SEP_SPACE 0 // any character for which isspace() is true, e.g. ' ', \t, \n, \v, \f, \r
+#define KS_SEP_TAB   1 // isspace() && !' '
+#define KS_SEP_LINE  2 // line separator: "\n" (Unix) or "\r\n" (Windows)
+#define KS_SEP_MAX   2 // delimiters above this value are matched as a literal character
+
+#define __KS_TYPE(type_t) /* declares kstream_t, a buffered reader over a handle of type type_t */ \
+	typedef struct __kstream_t {				\
+		unsigned char *buf; /* read buffer */	\
+		int begin, end, is_eof; /* unread bytes are buf[begin..end-1]; is_eof is set once a read returned nothing */ \
+		type_t f; /* underlying handle passed to the read function */ \
+	} kstream_t;
+
+#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) // true once the source is exhausted and the buffer is drained
+#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) // resets the buffer state only; the handle itself is not touched here
+
+#define __KS_BASIC(type_t, __bufsize) /* ks_init(): new zeroed stream around f with a __bufsize-byte buffer; ks_destroy(): free the buffer and the stream */ \
+	static inline kstream_t *ks_init(type_t f)						\
+	{																\
+		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); /* NOTE(review): neither allocation is checked for failure */ \
+		ks->f = f;													\
+		ks->buf = (unsigned char*)malloc(__bufsize);				\
+		return ks;													\
+	}																\
+	static inline void ks_destroy(kstream_t *ks)					\
+	{																\
+		if (ks) { /* NULL is accepted; ks->f is left for the caller to close */ \
+			free(ks->buf);											\
+			free(ks);												\
+		}															\
+	}
+
+#define __KS_GETC(__read, __bufsize) /* ks_getc(): next byte of the stream as 0..255, or -1 when nothing more can be read */ \
+	static inline int ks_getc(kstream_t *ks)				\
+	{														\
+		if (ks->is_eof && ks->begin >= ks->end) return -1;	\
+		if (ks->begin >= ks->end) { /* buffer drained: refill it */ \
+			ks->begin = 0;									\
+			ks->end = __read(ks->f, ks->buf, __bufsize);	\
+			if (ks->end <= 0) { ks->end = 0; ks->is_eof = 1; return -1;} /* 0 means EOF; a negative result (failed read) is treated the same instead of returning a stale buffer byte */ \
+		}													\
+		return (int)ks->buf[ks->begin++];					\
+	}
+
+#ifndef KSTRING_T /* guard so that another header defining kstring_t can coexist */
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+	size_t l, m; /* l: bytes currently used in s; m: bytes allocated for s */
+	char *s; /* heap buffer, grown with realloc() by the readers below */
+} kstring_t;
+#endif
+
+#ifndef kroundup32 /* rounds x up, in place, to the next power of two; the shifts only cover 32 bits */
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#define __KS_GETUNTIL(__read, __bufsize) /* ks_getuntil2(): read up to (not including) the next delimiter into str; returns str->l, or -1 if nothing was read at EOF */ \
+	static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
+	{																	\
+		int gotany = 0;													\
+		if (dret) *dret = 0; /* stays 0 unless a delimiter byte is found */ \
+		str->l = append? str->l : 0;									\
+		for (;;) {														\
+			int i;														\
+			if (ks->begin >= ks->end) { /* buffer drained: refill it */	\
+				if (!ks->is_eof) {										\
+					ks->begin = 0;										\
+					ks->end = __read(ks->f, ks->buf, __bufsize);		\
+					if (ks->end <= 0) { ks->end = 0; ks->is_eof = 1; break; } /* 0 means EOF; a negative result (failed read) ends the stream too instead of looping forever */ \
+				} else break;											\
+			}															\
+			if (delimiter == KS_SEP_LINE) { \
+				for (i = ks->begin; i < ks->end; ++i) \
+					if (ks->buf[i] == '\n') break; \
+			} else if (delimiter > KS_SEP_MAX) { /* a literal delimiter character */ \
+				for (i = ks->begin; i < ks->end; ++i)					\
+					if (ks->buf[i] == delimiter) break;					\
+			} else if (delimiter == KS_SEP_SPACE) {						\
+				for (i = ks->begin; i < ks->end; ++i)					\
+					if (isspace(ks->buf[i])) break;						\
+			} else if (delimiter == KS_SEP_TAB) {						\
+				for (i = ks->begin; i < ks->end; ++i)					\
+					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
+			} else i = 0; /* never come to here! */						\
+			if (str->m - str->l < (size_t)(i - ks->begin + 1)) { /* grow str so the chunk plus a terminating NUL fits */ \
+				str->m = str->l + (i - ks->begin) + 1;					\
+				kroundup32(str->m);										\
+				str->s = (char*)realloc(str->s, str->m);				\
+			}															\
+			gotany = 1;													\
+			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
+			str->l = str->l + (i - ks->begin);							\
+			ks->begin = i + 1; /* step over the delimiter (or past the buffer end) */ \
+			if (i < ks->end) { /* delimiter found inside the buffer: done */ \
+				if (dret) *dret = ks->buf[i];							\
+				break;													\
+			}															\
+		}																\
+		if (!gotany && ks_eof(ks)) return -1;							\
+		if (str->s == 0) { /* nothing was ever stored: hand back an empty string */ \
+			str->m = 1;													\
+			str->s = (char*)calloc(1, 1);								\
+		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; /* drop the '\r' of a "\r\n" line end */ \
+		str->s[str->l] = '\0';											\
+		return str->l;													\
+	} \
+	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) /* same as ks_getuntil2() with append = 0 */ \
+	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
+
+#define KSTREAM_INIT(type_t, __read, __bufsize) /* emits kstream_t plus ks_init/ks_destroy/ks_getc/ks_getuntil for one handle type and read function */ \
+	__KS_TYPE(type_t)							\
+	__KS_BASIC(type_t, __bufsize)				\
+	__KS_GETC(__read, __bufsize)				\
+	__KS_GETUNTIL(__read, __bufsize)
+/* kseq_rewind(): clears the parser's look-ahead character and the stream's buffer state; it does not reposition the underlying handle */
+#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
+
+#define __KSEQ_BASIC(SCOPE, type_t) /* kseq_init(): new zeroed record reader over fd; kseq_destroy(): free its strings, its stream and itself */ \
+	SCOPE kseq_t *kseq_init(type_t fd)									\
+	{																	\
+		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); /* NOTE(review): the calloc() result is not checked */ \
+		s->f = ks_init(fd);												\
+		return s;														\
+	}																	\
+	SCOPE void kseq_destroy(kseq_t *ks)									\
+	{																	\
+		if (!ks) return; /* NULL is accepted */							\
+		free(ks->name.s); free(ks->comment.s); free(ks->seq.s);	free(ks->qual.s); \
+		ks_destroy(ks->f);												\
+		free(ks);														\
+	}
+
+/* kseq_read(): parse the next FASTA/FASTQ record into seq. Return value:
+   >=0  length of the sequence (normal)
+   -1   end-of-file
+   -2   truncated quality string
+ */
+#define __KSEQ_READ(SCOPE) \
+	SCOPE int kseq_read(kseq_t *seq) \
+	{ \
+		int c; \
+		kstream_t *ks = seq->f; \
+		if (seq->last_char == 0) { /* then jump to the next header line */ \
+			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
+			if (c == -1) return -1; /* end of file */ \
+			seq->last_char = c; \
+		} /* else: the first header char has been read in the previous call */ \
+		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
+		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* delimiter 0 is KS_SEP_SPACE: the name ends at the first whitespace; normal exit: EOF */ \
+		if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
+		if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
+			seq->seq.m = 256; \
+			seq->seq.s = (char*)malloc(seq->seq.m); \
+		} \
+		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
+			if (c == '\n') continue; /* skip empty lines */ \
+			seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
+			ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
+		} \
+		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */	\
+		if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
+			seq->seq.m = seq->seq.l + 2; \
+			kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
+			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+		} \
+		seq->seq.s[seq->seq.l] = 0;	/* null terminated string */ \
+		if (c != '+') return seq->seq.l; /* FASTA */ \
+		if (seq->qual.m < seq->seq.m) {	/* allocate memory for qual in case insufficient */ \
+			seq->qual.m = seq->seq.m; \
+			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
+		} \
+		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
+		if (c == -1) return -2; /* error: no quality string */ \
+		while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); /* append quality lines until as long as the sequence */ \
+		seq->last_char = 0;	/* we have not come to the next header line */ \
+		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
+		return seq->seq.l; \
+	}
+
+#define __KSEQ_TYPE(type_t) /* declares kseq_t: one parsed record plus the stream it is read from */ \
+	typedef struct {							\
+		kstring_t name, comment, seq, qual;		\
+		int last_char; /* header character ('>' or '@') already consumed by the previous read, or 0 */ \
+		kstream_t *f;							\
+	} kseq_t;
+/* KSEQ_INIT2: emit the complete reader (stream with a 16384-byte buffer, kseq_t, init/destroy/read) with the given linkage */
+#define KSEQ_INIT2(SCOPE, type_t, __read)		\
+	KSTREAM_INIT(type_t, __read, 16384)			\
+	__KSEQ_TYPE(type_t)							\
+	__KSEQ_BASIC(SCOPE, type_t)					\
+	__KSEQ_READ(SCOPE)
+/* KSEQ_INIT: the usual form, with every generated function declared static */
+#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
+/* KSEQ_DECLARE: emit only the types and the prototypes of kseq_init/kseq_destroy/kseq_read, without any function bodies */
+#define KSEQ_DECLARE(type_t) \
+	__KS_TYPE(type_t) \
+	__KSEQ_TYPE(type_t) \
+	extern kseq_t *kseq_init(type_t fd); \
+	void kseq_destroy(kseq_t *ks); \
+	int kseq_read(kseq_t *seq);
+
+#endif
diff --git a/web/server/h2o/libh2o/deps/klib/kson.c b/web/server/h2o/libh2o/deps/klib/kson.c
new file mode 100644
index 000000000..a8bf1601f
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/kson.c
@@ -0,0 +1,253 @@
+#include <string.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <assert.h>
+#include <ctype.h>
+#include <stdio.h>
+#include "kson.h"
+
+/*************
+ *** Parse ***
+ *************/
+
+kson_node_t *kson_parse_core(const char *json, long *_n, int *error, long *parsed_len)
+{ /* Parse json into a flat node array (returned; *_n gets its length). *error gets a KSON_* code; parsed_len, if non-NULL, gets the offset where parsing stopped. */
+	long *stack = 0, top = 0, max = 0, n_a = 0, m_a = 0, i, j;
+	kson_node_t *a = 0, *u;
+	const char *p, *q;
+	size_t *tmp;
+	/* stack entries: a value >= 0 is an index into a[]; -1 marks '[', -2 marks '{', -3 marks a ':' waiting for its value */
+#define __push_back(y) do { \
+		if (top == max) { \
+			max = max? max<<1 : 4; \
+			stack = (long*)realloc(stack, sizeof(long) * max); \
+		} \
+		stack[top++] = (y); \
+	} while (0)
+
+#define __new_node(z) do { \
+		if (n_a == m_a) { \
+			long old_m = m_a; \
+			m_a = m_a? m_a<<1 : 4; \
+			a = (kson_node_t*)realloc(a, sizeof(kson_node_t) * m_a); \
+			memset(a + old_m, 0, sizeof(kson_node_t) * (m_a - old_m)); \
+		} \
+		*(z) = &a[n_a++]; \
+	} while (0)
+	/* child arrays first hold node indices (a[] can still be moved by realloc) and are turned into pointers at the end, hence the size check below */
+	assert(sizeof(size_t) == sizeof(kson_node_t*));
+	*error = KSON_OK;
+	for (p = json; *p; ++p) {
+		while (*p && isspace((unsigned char)*p)) ++p; /* cast: isspace() is undefined for negative char values */
+		if (*p == 0) break;
+		if (*p == ',') { // comma is somewhat redundant
+		} else if (*p == '[' || *p == '{') {
+			int t = *p == '['? -1 : -2;
+			if (top < 2 || stack[top-1] != -3) { // unnamed internal node
+				__push_back(n_a);
+				__new_node(&u);
+				__push_back(t);
+			} else stack[top-1] = t; // named internal node
+		} else if (*p == ']' || *p == '}') {
+			long i, start, t = *p == ']'? -1 : -2;
+			for (i = top - 1; i >= 0 && stack[i] != t; --i);
+			if (i < 0) { // error: an extra right bracket
+				*error = KSON_ERR_EXTRA_RIGHT;
+				break;
+			}
+			start = i;
+			u = &a[stack[start-1]]; // the node that owns this bracket sits just below its marker
+			u->key = u->v.str;
+			u->n = top - 1 - start;
+			u->v.child = (kson_node_t**)malloc(u->n * sizeof(kson_node_t*));
+			tmp = (size_t*)u->v.child;
+			for (i = start + 1; i < top; ++i)
+				tmp[i - start - 1] = stack[i];
+			u->type = *p == ']'? KSON_TYPE_BRACKET : KSON_TYPE_BRACE;
+			if ((top = start) == 1) break; // completed one object; remaining characters discarded
+		} else if (*p == ':') {
+			if (top == 0 || stack[top-1] == -3) {
+				*error = KSON_ERR_NO_KEY;
+				break;
+			}
+			__push_back(-3);
+		} else {
+			int c = *p;
+			// get the node to modify
+			if (top >= 2 && stack[top-1] == -3) { // we have a key:value pair here
+				--top;
+				u = &a[stack[top-1]];
+				u->key = u->v.str; // move old value to key
+			} else { // don't know if this is a bare value or a key:value pair; keep it as a value for now
+				__push_back(n_a);
+				__new_node(&u);
+			}
+			// parse string
+			if (c == '\'' || c == '"') {
+				for (q = ++p; *q && *q != c; ++q)
+					if (*q == '\\') ++q;
+			} else {
+				for (q = p; *q && *q != ']' && *q != '}' && *q != ',' && *q != ':' && *q != '\n'; ++q)
+					if (*q == '\\') ++q;
+			}
+			u->v.str = (char*)malloc(q - p + 1); strncpy(u->v.str, p, q - p); u->v.str[q-p] = 0; // equivalent to u->v.str=strndup(p, q-p)
+			u->type = c == '\''? KSON_TYPE_SGL_QUOTE : c == '"'? KSON_TYPE_DBL_QUOTE : KSON_TYPE_NO_QUOTE;
+			p = c == '\'' || c == '"'? q : q - 1;
+		}
+	}
+	while (*p && isspace((unsigned char)*p)) ++p; // skip trailing blanks
+	if (parsed_len) *parsed_len = p - json;
+	if (top != 1) *error = KSON_ERR_EXTRA_LEFT;
+
+	for (i = 0; i < n_a; ++i) // convert the stored child indices into node pointers now that a[] is final
+		for (j = 0, u = &a[i], tmp = (size_t*)u->v.child; j < (long)u->n; ++j)
+			u->v.child[j] = &a[tmp[j]];
+
+	free(stack);
+	*_n = n_a;
+	return a;
+}
+
+void kson_destroy(kson_t *kson) /* free every node's strings, the node array and the container itself */
+{
+	long idx = 0;
+	if (!kson) return; /* destroying NULL is a no-op */
+	for (; idx < kson->n_nodes; ++idx) { /* v.str shares a union with v.child, so this also frees child arrays */
+		free(kson->root[idx].key); free(kson->root[idx].v.str);
+	}
+	free(kson->root); free(kson);
+}
+
+kson_t *kson_parse(const char *json)
+{
+	kson_t *kson;
+	int error;
+	kson = (kson_t*)calloc(1, sizeof(kson_t));
+	kson->root = kson_parse_core(json, &kson->n_nodes, &error, 0);
+	if (error) {
+		kson_destroy(kson);
+		return 0;
+	}
+	return kson;
+}
+
+/*************
+ *** Query ***
+ *************/
+
+const kson_node_t *kson_by_path(const kson_node_t *p, int depth, ...)
+{
+	/* Each variadic argument is one step: a const char* key under a {} node, a long index under a [] node. */
+	va_list steps;
+	va_start(steps, depth);
+	for (; p != 0 && depth > 0; --depth) {
+		if (p->type == KSON_TYPE_BRACKET)
+			p = kson_by_index(p, va_arg(steps, long));
+		else if (p->type == KSON_TYPE_BRACE)
+			p = kson_by_key(p, va_arg(steps, const char*));
+		else break; /* a leaf cannot be descended into: it is returned as is */
+	}
+	va_end(steps);
+	return p;
+}
+
+/**************
+ *** Format ***
+ **************/
+
+void kson_format_recur(const kson_node_t *p, int depth) /* print p and its subtree to stdout, two spaces of indent per level */
+{
+	long child;
+	int pad;
+	if (p->key) printf("\"%s\":", p->key);
+	if (p->type != KSON_TYPE_BRACKET && p->type != KSON_TYPE_BRACE) { /* leaf: the value inside its original quotes */
+		const int quote = p->type == KSON_TYPE_NO_QUOTE? 0 : p->type == KSON_TYPE_SGL_QUOTE? '\'' : '"';
+		if (quote) putchar(quote);
+		fputs(p->v.str, stdout);
+		if (quote) putchar(quote);
+		return;
+	}
+	/* internal node: brackets around the children, one child per line */
+	putchar(p->type == KSON_TYPE_BRACKET? '[' : '{');
+	if (p->n) {
+		for (child = 0; child < (long)p->n; ++child) {
+			/* every child starts on a fresh, indented line; all but the first follow a comma */
+			if (child) putchar(',');
+			putchar('\n');
+			for (pad = 0; pad <= depth; ++pad) fputs("  ", stdout);
+			kson_format_recur(p->v.child[child], depth + 1);
+		}
+		putchar('\n');
+		for (pad = 0; pad < depth; ++pad) fputs("  ", stdout);
+	}
+	putchar(p->type == KSON_TYPE_BRACKET? ']' : '}');
+}
+
+void kson_format(const kson_node_t *root) /* print the tree under root to stdout, followed by a newline */
+{
+	kson_format_recur(root, 0); /* depth 0: the top-level node is not indented */
+	putchar('\n');
+}
+
+/*********************
+ *** Main function ***
+ *********************/
+
+#ifdef KSON_MAIN
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+int main(int argc, char *argv[]) /* demo: parse and print argv[1] (optionally following the path in argv[2..]), or a built-in sample */
+{
+	kson_t *kson = 0;
+	if (argc > 1) {
+		FILE *fp;
+		int len = 0, max = 0, tmp, i;
+		char *json = 0, buf[0x10000];
+		if ((fp = fopen(argv[1], "rb")) != 0) {
+			// read the entire file into a string
+			while ((tmp = fread(buf, 1, 0x10000, fp)) != 0) {
+				if (len + tmp + 1 > max) { // always keep one spare byte for the terminator
+					max = len + tmp + 1;
+					kroundup32(max);
+					json = (char*)realloc(json, max);
+				}
+				memcpy(json + len, buf, tmp);
+				len += tmp;
+			}
+			fclose(fp);
+			if (json) json[len] = 0; // NUL-terminate: the parser scans until '\0'
+			kson = json? kson_parse(json) : 0; // parse; an empty file leaves json NULL and is reported as a parse failure
+			free(json);
+			if (kson) {
+				kson_format(kson->root);
+				if (argc > 2) {
+					// path finding
+					const kson_node_t *p = kson->root;
+					for (i = 2; i < argc && p; ++i) {
+						if (p->type == KSON_TYPE_BRACKET)
+							p = kson_by_index(p, atoi(argv[i]));
+						else if (p->type == KSON_TYPE_BRACE)
+							p = kson_by_key(p, argv[i]);
+						else p = 0;
+					}
+					if (p) {
+						if (kson_is_internal(p)) printf("Reached an internal node\n");
+						else printf("Value: %s\n", p->v.str);
+					} else printf("Failed to find the slot\n");
+				}
+			} else printf("Failed to parse\n");
+		}
+	} else {
+		kson = kson_parse("{'a' : 1,'b':[0,'isn\\'t',true],'d':[{\n\n\n}]}");
+		if (kson) {
+			const kson_node_t *p = kson_by_path(kson->root, 2, "b", 1L); // kson_by_path() reads array indices with va_arg(ap, long)
+			if (p) printf("*** %s\n", p->v.str);
+			else printf("!!! not found\n");
+			kson_format(kson->root);
+		} else {
+			printf("Failed to parse\n");
+		}
+	}
+	kson_destroy(kson);
+	return 0;
+}
+#endif
diff --git a/web/server/h2o/libh2o/deps/klib/kson.h b/web/server/h2o/libh2o/deps/klib/kson.h
new file mode 100644
index 000000000..a03eb52f5
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/kson.h
@@ -0,0 +1,64 @@
+#ifndef KSON_H
+#define KSON_H
+
+#include <string.h>
+
+#define KSON_TYPE_NO_QUOTE  1 /* leaf value written without quotes */
+#define KSON_TYPE_SGL_QUOTE 2 /* leaf value written in single quotes */
+#define KSON_TYPE_DBL_QUOTE 3 /* leaf value written in double quotes */
+#define KSON_TYPE_BRACKET   4 /* internal node written as [ ... ] */
+#define KSON_TYPE_BRACE     5 /* internal node written as { ... } */
+
+#define KSON_OK              0
+#define KSON_ERR_EXTRA_LEFT  1 /* the parser's stack did not collapse to one finished value */
+#define KSON_ERR_EXTRA_RIGHT 2 /* a closing bracket without a matching opener */
+#define KSON_ERR_NO_KEY      3 /* a ':' with nothing usable as a key before it */
+
+typedef struct kson_node_s {
+	unsigned long long type:3, n:61; /* type: one of KSON_TYPE_*; n: number of children (0 for leaves) */
+	char *key; /* key of a key:value pair, or 0 */
+	union {
+		struct kson_node_s **child; /* internal nodes: array of n child pointers */
+		char *str; /* leaves: the value text */
+	} v;
+} kson_node_t;
+
+typedef struct {
+	long n_nodes; /* number of elements in root[] */
+	kson_node_t *root; /* array of all nodes; its first element is the top-level node */
+} kson_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	kson_t *kson_parse(const char *json); /* returns 0 when parsing fails */
+	void kson_destroy(kson_t *kson); /* accepts 0 */
+	const kson_node_t *kson_by_path(const kson_node_t *root, int path_len, ...); /* path_len steps follow: const char* keys for {} nodes, long indices for [] nodes */
+	void kson_format(const kson_node_t *root); /* prints to stdout */
+
+#ifdef __cplusplus
+}
+#endif
+
+#define kson_is_internal(p) ((p)->type == KSON_TYPE_BRACKET || (p)->type == KSON_TYPE_BRACE) /* true for [] and {} nodes, i.e. nodes that use v.child */
+
+static inline const kson_node_t *kson_by_key(const kson_node_t *p, const char *key)
+{
+	long idx, count;
+	if (p->type != KSON_TYPE_BRACKET && p->type != KSON_TYPE_BRACE) return 0; /* leaves have no children */
+	count = (long)p->n;
+	for (idx = 0; idx < count; ++idx) { /* linear scan: the first child with an equal key wins */
+		const kson_node_t *kid = p->v.child[idx];
+		if (kid->key != 0 && strcmp(kid->key, key) == 0) return kid;
+	}
+	return 0;
+}
+
+static inline const kson_node_t *kson_by_index(const kson_node_t *p, long i)
+{
+	if (!kson_is_internal(p) || i < 0 || i >= (long)p->n) return 0; /* a leaf, or an index outside 0..n-1 */
+	return p->v.child[i];
+}
+
+#endif
diff --git a/web/server/h2o/libh2o/deps/klib/ksort.h b/web/server/h2o/libh2o/deps/klib/ksort.h
new file mode 100644
index 000000000..4da7a13ef
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/ksort.h
@@ -0,0 +1,298 @@
+/* The MIT License
+
+   Copyright (c) 2008, 2011 Attractive Chaos <attractor@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/*
+  2011-04-10 (0.1.6):
+
+  	* Added sample
+
+  2011-03 (0.1.5):
+
+	* Added shuffle/permutation
+
+  2008-11-16 (0.1.4):
+
+    * Fixed a bug in introsort() that happens in rare cases.
+
+  2008-11-05 (0.1.3):
+
+    * Fixed a bug in introsort() for complex comparisons.
+
+	* Fixed a bug in mergesort(). The previous version is not stable.
+
+  2008-09-15 (0.1.2):
+
+	* Accelerated introsort. On my Mac (not on another Linux machine),
+	  my implementation is as fast as std::sort on random input.
+
+	* Added combsort and in introsort, switch to combsort if the
+	  recursion is too deep.
+
+  2008-09-13 (0.1.1):
+
+	* Added k-small algorithm
+
+  2008-09-05 (0.1.0):
+
+	* Initial version
+
+*/
+
+#ifndef AC_KSORT_H
+#define AC_KSORT_H
+
+#include <stdlib.h>
+#include <string.h>
+
+typedef struct {
+	void *left, *right; /* first and last element (both inclusive) of a sub-range ks_introsort_*() still has to sort */
+	int depth; /* value of the depth budget `d` at the moment the sub-range was pushed */
+} ks_isort_stack_t;
+
+#define KSORT_SWAP(type_t, a, b) { type_t ks_swap_tmp_=(a); (a)=(b); (b)=ks_swap_tmp_; } /* swap two lvalues; no `register` (removed in C++17) and a temp name that cannot shadow an argument called t */
+
+#define KSORT_INIT(name, type_t, __sort_lt)								\
+	void ks_mergesort_##name(size_t n, type_t array[], type_t temp[])	\
+	{ /* stable bottom-up merge sort of array[0..n-1]; temp is an n-element scratch buffer, or NULL to have one malloc'ed and freed here */ \
+		type_t *a2[2], *a, *b;											\
+		int curr, shift;												\
+																		\
+		a2[0] = array;													\
+		a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); /* NOTE(review): the malloc() result is not checked */ \
+		for (curr = 0, shift = 0; (1ul<<shift) < n; ++shift) { /* each pass reads from a and writes merged runs to b */ \
+			a = a2[curr]; b = a2[1-curr];								\
+			if (shift == 0) { /* first pass: put every adjacent pair in order */ \
+				type_t *p = b, *i, *eb = a + n;							\
+				for (i = a; i < eb; i += 2) {							\
+					if (i == eb - 1) *p++ = *i;							\
+					else {												\
+						if (__sort_lt(*(i+1), *i)) {					\
+							*p++ = *(i+1); *p++ = *i;					\
+						} else {										\
+							*p++ = *i; *p++ = *(i+1);					\
+						}												\
+					}													\
+				}														\
+			} else { /* later passes: merge neighbouring runs of `step` elements */ \
+				size_t i, step = 1ul<<shift;							\
+				for (i = 0; i < n; i += step<<1) {						\
+					type_t *p, *j, *k, *ea, *eb;						\
+					if (n < i + step) {									\
+						ea = a + n; eb = a; /* no right-hand run: eb = a keeps every `k < eb` test false */ \
+					} else {											\
+						ea = a + i + step;								\
+						eb = a + (n < i + (step<<1)? n : i + (step<<1)); \
+					}													\
+					j = a + i; k = a + i + step; p = b + i;				\
+					while (j < ea && k < eb) {							\
+						if (__sort_lt(*k, *j)) *p++ = *k++; /* right element only when strictly smaller: keeps equal items in order */ \
+						else *p++ = *j++;								\
+					}													\
+					while (j < ea) *p++ = *j++;							\
+					while (k < eb) *p++ = *k++;							\
+				}														\
+			}															\
+			curr = 1 - curr;											\
+		}																\
+		if (curr == 1) { /* the last pass wrote into the scratch buffer: copy the result back */ \
+			type_t *p = a2[0], *i = a2[1], *eb = array + n;				\
+			for (; p < eb; ++i) *p++ = *i;								\
+		}																\
+		if (temp == 0) free(a2[1]);										\
+	}																	\
+	void ks_heapadjust_##name(size_t i, size_t n, type_t l[])			\
+	{ /* sift l[i] down inside l[0..n-1] until neither child compares greater than it */ \
+		size_t child;													\
+		type_t held = l[i];												\
+		for (child = 2 * i + 1; child < n; child = 2 * child + 1) {	\
+			if (child + 1 != n && __sort_lt(l[child], l[child+1])) ++child; /* follow the larger child */ \
+			if (__sort_lt(l[child], held)) break;						\
+			l[i] = l[child]; i = child;									\
+		}																\
+		l[i] = held;													\
+	}																	\
+	void ks_heapmake_##name(size_t lsize, type_t l[])					\
+	{ /* turn l[0..lsize-1] into a heap by sifting down every parent, last parent first */ \
+		size_t i = lsize >> 1;											\
+		while (i-- > 0)													\
+			ks_heapadjust_##name(i, lsize, l);							\
+	}																	\
+	void ks_heapsort_##name(size_t lsize, type_t l[])					\
+	{ /* sorts a heap in place; it does not build the heap itself, so run ks_heapmake_##name() first */ \
+		size_t i;														\
+		for (i = lsize; i-- > 1; ) { /* i = lsize-1 .. 1; unlike `i = lsize - 1` this cannot wrap when lsize == 0 */ \
+			type_t tmp = *l;											\
+			*l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); /* move the root behind the shrinking heap */ \
+		}																\
+	}																	\
+	static inline void __ks_insertsort_##name(type_t *s, type_t *t)		\
+	{ /* insertion sort of the half-open range [s, t) using adjacent swaps */ \
+		type_t *cur, *pos, held;										\
+		for (cur = s + 1; cur < t; ++cur)								\
+			for (pos = cur; pos > s && __sort_lt(pos[0], pos[-1]); --pos) { \
+				held = pos[0]; pos[0] = pos[-1]; pos[-1] = held;		\
+			}															\
+	}																	\
+	void ks_combsort_##name(size_t n, type_t a[])						\
+	{ /* comb sort of a[0..n-1]; ends with an insertion sort unless the final pass ran with gap == 1 */ \
+		const double shrink_factor = 1.2473309501039786540366528676643; \
+		int do_swap;													\
+		size_t gap = n;													\
+		type_t tmp, *i, *j;												\
+		do {															\
+			if (gap > 2) {												\
+				gap = (size_t)(gap / shrink_factor);					\
+				if (gap == 9 || gap == 10) gap = 11; /* gaps 9 and 10 are replaced by 11 */ \
+			}															\
+			do_swap = 0;												\
+			for (i = a; i < a + n - gap; ++i) {							\
+				j = i + gap;											\
+				if (__sort_lt(*j, *i)) {								\
+					tmp = *i; *i = *j; *j = tmp;						\
+					do_swap = 1;										\
+				}														\
+			}															\
+		} while (do_swap || gap > 2); /* stop after a swap-free pass once the gap is 2 or less */ \
+		if (gap != 1) __ks_insertsort_##name(a, a + n);					\
+	}																	\
+	void ks_introsort_##name(size_t n, type_t a[])						\
+	{ /* quicksort with an explicit stack; a range is handed to ks_combsort when the depth budget hits 0; one final insertion sort finishes the small ranges */ \
+		int d;															\
+		ks_isort_stack_t *top, *stack;									\
+		type_t rp, swap_tmp;											\
+		type_t *s, *t, *i, *j, *k;										\
+																		\
+		if (n < 1) return;												\
+		else if (n == 2) {												\
+			if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \
+			return;														\
+		}																\
+		for (d = 2; 1ul<<d < n; ++d); /* smallest d >= 2 with 2^d >= n */ \
+		stack = (ks_isort_stack_t*)malloc(sizeof(ks_isort_stack_t) * ((sizeof(size_t)*d)+2)); /* NOTE(review): result not checked */ \
+		top = stack; s = a; t = a + (n-1); d <<= 1; /* [s, t] is inclusive; d becomes the depth budget */ \
+		while (1) {														\
+			if (s < t) {												\
+				if (--d == 0) { /* budget used up: sort this range with comb sort instead */ \
+					ks_combsort_##name(t - s + 1, s);					\
+					t = s;												\
+					continue;											\
+				}														\
+				i = s; j = t; k = i + ((j-i)>>1) + 1; /* pivot candidates: first, middle, last */ \
+				if (__sort_lt(*k, *i)) {								\
+					if (__sort_lt(*k, *j)) k = j;						\
+				} else k = __sort_lt(*j, *i)? i : j;					\
+				rp = *k;												\
+				if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } /* park the pivot at the end of the range */ \
+				for (;;) {												\
+					do ++i; while (__sort_lt(*i, rp));					\
+					do --j; while (i <= j && __sort_lt(rp, *j));		\
+					if (j <= i) break;									\
+					swap_tmp = *i; *i = *j; *j = swap_tmp;				\
+				}														\
+				swap_tmp = *i; *i = *t; *t = swap_tmp; /* pivot into its final slot i */ \
+				if (i-s > t-i) { /* left part is larger: push it (only if > 16 elements) and continue with the right part */ \
+					if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \
+					s = t-i > 16? i+1 : t;								\
+				} else {												\
+					if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \
+					t = i-s > 16? i-1 : s;								\
+				}														\
+			} else {													\
+				if (top == stack) { /* nothing pending: the skipped short ranges are fixed by one insertion sort */ \
+					free(stack);										\
+					__ks_insertsort_##name(a, a+n);						\
+					return;												\
+				} else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \
+			}															\
+		}																\
+	}																	\
+	/* This function is adapted from: http://ndevilla.free.fr/median/ */ \
+	/* 0 <= kk < n; returns the value that ends up at arr[kk]; arr[] is reordered in the process */ \
+	type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk)			\
+	{ /* NOTE(review): n == 0 computes arr - 1 below; callers appear to be expected to pass n >= 1 */ \
+		type_t *low, *high, *k, *ll, *hh, *mid;							\
+		low = arr; high = arr + n - 1; k = arr + kk;					\
+		for (;;) {														\
+			if (high <= low) return *k;									\
+			if (high == low + 1) {										\
+				if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
+				return *k;												\
+			}															\
+			mid = low + (high - low) / 2;								\
+			if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \
+			if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
+			if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low);	\
+			KSORT_SWAP(type_t, *mid, *(low+1));							\
+			ll = low + 1; hh = high;									\
+			for (;;) { /* partition around the pivot now stored in *low */ \
+				do ++ll; while (__sort_lt(*ll, *low));					\
+				do --hh; while (__sort_lt(*low, *hh));					\
+				if (hh < ll) break;										\
+				KSORT_SWAP(type_t, *ll, *hh);							\
+			}															\
+			KSORT_SWAP(type_t, *low, *hh);								\
+			if (hh <= k) low = ll; /* keep only the side that still contains k */ \
+			if (hh >= k) high = hh - 1;									\
+		}																\
+	}																	\
+	void ks_shuffle_##name(size_t n, type_t a[])						\
+	{ /* in-place shuffle of a[0..n-1]; randomness comes from drand48() */ \
+		size_t i, j; /* size_t rather than int, so n above INT_MAX is not truncated */ \
+		for (i = n; i > 1; --i) {										\
+			type_t tmp;													\
+			j = (size_t)(drand48() * i); /* 0 <= j < i */				\
+			tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp;					\
+		}																\
+	}																	\
+	void ks_sample_##name(size_t n, size_t r, type_t a[]) /* FIXME: NOT TESTED!!! */ \
+	{ /* reference: http://code.activestate.com/recipes/272884/ -- swaps r sampled elements into a[0..r-1]; expects r <= n */ \
+		int i, k, pop = n; \
+		for (i = (int)r, k = 0; i > 0; --i) { /* exactly r draws; the former `i >= 0` ran an extra pass with i == 0 that drained pop and swapped a[n] */ \
+			double z = 1., x = drand48(); \
+			type_t tmp; \
+			while (x < z) z -= z * i / (pop--); \
+			if (k != n - pop - 1) tmp = a[k], a[k] = a[n-pop-1], a[n-pop-1] = tmp; \
+			++k; \
+		} \
+	}
+
+#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t)
+#define ks_introsort(name, n, a) ks_introsort_##name(n, a)
+#define ks_combsort(name, n, a) ks_combsort_##name(n, a)
+#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a)
+#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a)
+#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a)
+#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k)
+#define ks_shuffle(name, n, a) ks_shuffle_##name(n, a)
+
+#define ks_lt_generic(a, b) ((a) < (b))
+#define ks_lt_str(a, b) (strcmp((a), (b)) < 0)
+
+typedef const char *ksstr_t;
+
+#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic)
+#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str)
+
+#endif
diff --git a/web/server/h2o/libh2o/deps/klib/kstring.c b/web/server/h2o/libh2o/deps/klib/kstring.c
new file mode 100644
index 000000000..f0293172a
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/kstring.c
@@ -0,0 +1,229 @@
+#include <stdarg.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdint.h>
+#include "kstring.h"
+
+int kvsprintf(kstring_t *s, const char *fmt, va_list ap)
+{ // Append printf-style output to s, growing the buffer as needed. Returns the number of bytes appended, or -1 on error (s->l and s->m are then unchanged).
+	va_list args; int l; char *tmp; size_t m;
+	va_copy(args, ap);
+	l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args); // This line does not work with glibc 2.0. See `man snprintf'.
+	va_end(args);
+	if (l < 0) return -1; // output error: adding a negative l to s->l would corrupt the length
+	if ((size_t)l + 1 > s->m - s->l) { // output was truncated: grow the buffer and format again
+		m = s->l + l + 2;
+		kroundup32(m);
+		if ((tmp = (char*)realloc(s->s, m)) == 0) return -1; // out of memory: the old buffer stays valid and owned by s
+		s->s = tmp; s->m = m;
+		va_copy(args, ap);
+		l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args);
+		va_end(args);
+	}
+	s->l += l;
+	return l;
+}
+
+int ksprintf(kstring_t *s, const char *fmt, ...)
+{ // Variadic front end of kvsprintf(); returns whatever kvsprintf() returns.
+	int n_appended;
+	va_list va;
+	va_start(va, fmt);
+	n_appended = kvsprintf(s, fmt, va);
+	va_end(va);
+	return n_appended;
+}
+
+char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux)
+{ // Non-destructive strtok_r(): returns the start of the next token (it ends at aux->p), or NULL once the string is exhausted.
+	const unsigned char *p, *start, *usep = (const unsigned char*)sep; // unsigned so bytes >= 0x80 cannot index tab[] negatively
+	if (usep) { // set up the table
+		if (str == 0 && aux->finished) return 0; // finished: do not reset the flag and scan past the terminating NUL
+		aux->finished = 0;
+		if (usep[0] && usep[1]) { // two or more separators; testing usep[0] first avoids reading past an empty sep
+			aux->sep = -1;
+			aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0;
+			for (p = usep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f);
+		} else aux->sep = usep[0];
+	}
+	if (aux->finished) return 0;
+	start = str? (const unsigned char*)str : (const unsigned char*)aux->p + 1; // new string, or resume just past the last separator
+	if (aux->sep < 0) {
+		for (p = start; *p; ++p)
+			if (aux->tab[*p>>6]>>(*p&0x3f)&1) break;
+	} else {
+		for (p = start; *p; ++p)
+			if (*p == aux->sep) break;
+	}
+	aux->p = (const char*)p; // end of token
+	if (*p == 0) aux->finished = 1; // no more tokens
+	return (char*)start;
+}
+
+// s MUST BE NUL-terminated. Splits s in place at `delimiter` (0 = whitespace), appends field start offsets to *_offsets (capacity *_max, grown with realloc) and returns the field count; with _offsets == NULL it only counts and leaves s untouched.
+int ksplit_core(char *s, int delimiter, int *_max, int **_offsets)
+{
+	int i, n, max, last_char, last_start, *offsets, l;
+	n = 0; max = _max? *_max : 0; offsets = _offsets? *_offsets : 0; // both may be NULL in count-only mode
+	l = strlen(s);
+	
+#define __ksplit_aux do {						\
+		if (_offsets) {						\
+			s[i] = 0;					\
+			if (n == max) {					\
+				int *tmp;				\
+				max = max? max<<1 : 2;			\
+				if ((tmp = (int*)realloc(offsets, sizeof(int) * max))) {  \
+					offsets = tmp;			\
+				} else	{				\
+					free(offsets);			\
+					*_offsets = NULL; if (_max) *_max = 0; /* keep capacity consistent with the freed array */ \
+					return 0;			\
+				}					\
+			}						\
+			offsets[n++] = last_start;			\
+		} else ++n;						\
+	} while (0)
+
+	for (i = 0, last_char = last_start = 0; i <= l; ++i) {
+		if (delimiter == 0) { // <ctype.h> functions need unsigned char values: a negative char is undefined behaviour
+			if (isspace((unsigned char)s[i]) || s[i] == 0) {
+				if (isgraph((unsigned char)last_char)) __ksplit_aux; // the end of a field
+			} else {
+				if (isspace((unsigned char)last_char) || last_char == 0) last_start = i;
+			}
+		} else {
+			if (s[i] == delimiter || s[i] == 0) {
+				if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field
+			} else {
+				if (last_char == delimiter || last_char == 0) last_start = i;
+			}
+		}
+		last_char = s[i];
+	}
+	if (_max) *_max = max; if (_offsets) *_offsets = offsets;
+	return n;
+}
+
+/**********************
+ * Boyer-Moore search *
+ **********************/
+
+typedef unsigned char ubyte_t;
+
+// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html
+static int *ksBM_prep(const ubyte_t *pat, int m)
+{ // Build the Boyer-Moore tables for pat[0..m-1]: m good-suffix shifts followed by 256 bad-character shifts. Returns a calloc'ed array, or NULL if m <= 0 or memory runs out.
+	int i, *suff, *prep, *bmGs, *bmBc;
+	if (m <= 0) return 0; // nothing to preprocess; suff[m - 1] below would be written out of bounds
+	prep = (int*)calloc(m + 256, sizeof(int));
+	suff = (int*)calloc(m, sizeof(int));
+	if (prep == 0 || suff == 0) { free(prep); free(suff); return 0; }
+	bmGs = prep; bmBc = prep + m;
+	for (i = 0; i < 256; ++i) bmBc[i] = m; // preBmBc()
+	for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1;
+	{ // suffixes()
+		int f = 0, g;
+		suff[m - 1] = m;
+		g = m - 1;
+		for (i = m - 2; i >= 0; --i) {
+			if (i > g && suff[i + m - 1 - f] < i - g)
+				suff[i] = suff[i + m - 1 - f];
+			else {
+				if (i < g) g = i;
+				f = i;
+				while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g;
+				suff[i] = f - g;
+			}
+		}
+	}
+	{ // preBmGs()
+		int j = 0;
+		for (i = 0; i < m; ++i) bmGs[i] = m;
+		for (i = m - 1; i >= 0; --i)
+			if (suff[i] == i + 1)
+				for (; j < m - 1 - i; ++j)
+					if (bmGs[j] == m)
+						bmGs[j] = m - 1 - i;
+		for (i = 0; i <= m - 2; ++i)
+			bmGs[m - 1 - suff[i]] = m - 1 - i;
+	}
+	free(suff);
+	return prep;
+}
+
+void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep)
+{ // Boyer-Moore search for _pat[0..m-1] in _str[0..n-1]; returns the first match or NULL. With _prep != NULL the tables are cached in *_prep (caller frees); otherwise they are private to this call.
+	int i, j, shift, *prep, *bmGs, *bmBc;
+	const ubyte_t *str, *pat;
+	void *hit = 0;
+	str = (const ubyte_t*)_str; pat = (const ubyte_t*)_pat;
+	if (m <= 0) return (m == 0 && n >= 0)? (void*)_str : 0; // an empty pattern matches at offset 0; no tables are built for it
+	prep = (_prep == 0 || *_prep == 0)? ksBM_prep(pat, m) : *_prep;
+	if (prep == 0) return 0; // preprocessing failed
+	if (_prep && *_prep == 0) *_prep = prep;
+	bmGs = prep; bmBc = prep + m;
+	for (j = 0; j <= n - m; ) {
+		for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i);
+		if (i < 0) { hit = (void*)(str + j); break; }
+		shift = bmBc[str[i+j]] - m + 1 + i;
+		j += shift < bmGs[i]? bmGs[i] : shift; // advance by the larger of the two shifts
+	}
+	if (_prep == 0) free(prep); // also on a match: the early return used to leak the private tables
+	return hit;
+}
+
+char *kstrstr(const char *str, const char *pat, int **_prep)
+{ // strstr()-like wrapper around kmemmem(): both strings must be NUL-terminated; _prep is passed through unchanged
+	return (char*)kmemmem(str, strlen(str), pat, strlen(pat), _prep);
+}
+
+char *kstrnstr(const char *str, const char *pat, int n, int **_prep)
+{ // like kstrstr(), but only the first n bytes of str are searched; pat must be NUL-terminated
+	return (char*)kmemmem(str, n, pat, strlen(pat), _prep);
+}
+
+/***********************
+ * The main() function *
+ ***********************/
+
+#ifdef KSTRING_MAIN
+#include <stdio.h>
+int main()
+{ // Self-test driver, compiled only when KSTRING_MAIN is defined; prints results to stdout for manual inspection.
+	kstring_t *s;
+	int *fields, n, i;
+	ks_tokaux_t aux;
+	char *p;
+	s = (kstring_t*)calloc(1, sizeof(kstring_t)); // zeroed, i.e. an empty kstring_t
+	// test ksprintf()
+	ksprintf(s, " abcdefg:    %d ", 100);
+	printf("'%s'\n", s->s);
+	// test ksplit()
+	fields = ksplit(s, 0, &n); // delimiter 0 selects whitespace splitting
+	for (i = 0; i < n; ++i)
+		printf("field[%d] = '%s'\n", i, s->s + fields[i]);
+	// test kstrtok()
+	s->l = 0; // reuse the buffer from the start
+	for (p = kstrtok("ab:cde:fg/hij::k", ":/", &aux); p; p = kstrtok(0, 0, &aux)) {
+		kputsn(p, aux.p - p, s);
+		kputc('\n', s);
+	}
+	printf("%s", s->s);
+	// free
+	free(s->s); free(s); free(fields);
+
+	{
+		static char *str = "abcdefgcdgcagtcakcdcd";
+		static char *pat = "cd";
+		char *ret, *s = str;
+		int *prep = 0; // filled by the first kstrstr() call and reused by the later ones
+		while ((ret = kstrstr(s, pat, &prep)) != 0) {
+			printf("match: %s\n", ret);
+			s = ret + prep[0]; // continue after the match, advancing by the first table entry
+		}
+		free(prep);
+	}
+	return 0;
+}
+#endif
diff --git a/web/server/h2o/libh2o/deps/klib/kstring.h b/web/server/h2o/libh2o/deps/klib/kstring.h
new file mode 100644
index 000000000..0e654cb82
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/kstring.h
@@ -0,0 +1,259 @@
+/* The MIT License
+
+   Copyright (c) by Attractive Chaos <attractor@live.co.uk> 
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+#ifndef KSTRING_H
+#define KSTRING_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 4)
+#define KS_ATTR_PRINTF(fmt, arg) __attribute__((__format__ (__printf__, fmt, arg)))
+#else
+#define KS_ATTR_PRINTF(fmt, arg)
+#endif
+
+
+/* kstring_t is a simple non-opaque type whose fields are likely to be
+ * used directly by user code (but see also ks_str() and ks_len() below).
+ * A kstring_t object is initialised by either of
+ *       kstring_t str = { 0, 0, NULL };
+ *       kstring_t str; ...; str.l = str.m = 0; str.s = NULL;
+ * and either ownership of the underlying buffer should be given away before
+ * the object disappears (i.e., the str.s pointer copied and something else
+ * responsible for freeing it), or the kstring_t should be destroyed with
+ *       free(str.s);  */
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+	size_t l, m; /* l: bytes currently stored (not counting the NUL that the terminating kput helpers append); m: bytes allocated for s */
+	char *s; /* realloc()-managed buffer; NULL in a freshly initialised object */
+} kstring_t;
+#endif
+
+typedef struct {
+	uint64_t tab[4]; /* 256-bit set of separator bytes; consulted by kstrtok() when sep < 0 */
+	int sep, finished; /* sep: the single separator byte, or -1 to use tab[]; finished: set once the end of the string was reached */
+	const char *p; // end of the current token
+} ks_tokaux_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	int kvsprintf(kstring_t *s, const char *fmt, va_list ap) KS_ATTR_PRINTF(2,0);
+	int ksprintf(kstring_t *s, const char *fmt, ...) KS_ATTR_PRINTF(2,3);
+	int ksplit_core(char *s, int delimiter, int *_max, int **_offsets);
+	char *kstrstr(const char *str, const char *pat, int **_prep);
+	char *kstrnstr(const char *str, const char *pat, int n, int **_prep);
+	void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep);
+
+	/* kstrtok() is similar to strtok_r() except that str is not
+	 * modified and both str and sep can be NULL. For efficiency, it is
+	 * actually recommended to set both to NULL in the subsequent calls
+	 * if sep is not changed. */
+	char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux);
+
+#ifdef __cplusplus
+}
+#endif
+
+static inline int ks_resize(kstring_t *s, size_t size)
+{ /* Make sure s can hold at least `size` bytes. Returns 0 on success, -1 if realloc() fails (s is then unchanged). */
+	if (s->m < size) {
+		char *tmp;
+		size_t m = size;
+		kroundup32(m);
+		if ((tmp = (char*)realloc(s->s, m)) == 0)
+			return -1; /* s->m must keep describing the old, still valid buffer */
+		s->s = tmp;
+		s->m = m;
+	}
+	return 0;
+}
+
+static inline char *ks_str(kstring_t *s)
+{ /* accessor for the underlying buffer; the pointer is borrowed, not copied */
+	return s->s;
+}
+
+static inline size_t ks_len(kstring_t *s)
+{ /* accessor for the number of bytes currently stored */
+	return s->l;
+}
+
+static inline int kputsn(const char *p, int l, kstring_t *s)
+{ /* Append l bytes from p and NUL-terminate. Returns l, or EOF if realloc() fails (s is then unchanged). */
+	if (s->l + l + 1 >= s->m) {
+		char *tmp;
+		size_t m = s->l + l + 2;
+		kroundup32(m);
+		if ((tmp = (char*)realloc(s->s, m)) == 0)
+			return EOF; /* s->m is committed only once the larger buffer exists */
+		s->s = tmp;
+		s->m = m;
+	}
+	memcpy(s->s + s->l, p, l);
+	s->l += l;
+	s->s[s->l] = 0;
+	return l;
+}
+
+static inline int kputs(const char *p, kstring_t *s)
+{ /* append the NUL-terminated string p; returns what kputsn() returns */
+	return kputsn(p, strlen(p), s);
+}
+
+static inline int kputc(int c, kstring_t *s)
+{ /* Append one byte and keep s NUL-terminated. Returns c, or EOF if realloc() fails (s is then unchanged). */
+	if (s->l + 1 >= s->m) {
+		char *tmp;
+		size_t m = s->l + 2;
+		kroundup32(m);
+		if ((tmp = (char*)realloc(s->s, m)) == 0)
+			return EOF; /* s->m is committed only once the larger buffer exists */
+		s->s = tmp;
+		s->m = m;
+	}
+	s->s[s->l++] = c;
+	s->s[s->l] = 0;
+	return c;
+}
+
+static inline int kputc_(int c, kstring_t *s)
+{ /* Append one byte WITHOUT NUL-terminating. Returns 1, or EOF if realloc() fails (s is then unchanged). */
+	if (s->l + 1 > s->m) {
+		char *tmp;
+		size_t m = s->l + 1;
+		kroundup32(m);
+		if ((tmp = (char*)realloc(s->s, m)) == 0)
+			return EOF; /* s->m is committed only once the larger buffer exists */
+		s->s = tmp;
+		s->m = m;
+	}
+	s->s[s->l++] = c;
+	return 1;
+}
+
+static inline int kputsn_(const void *p, int l, kstring_t *s)
+{ /* Append l raw bytes WITHOUT NUL-terminating. Returns l, or EOF if realloc() fails (s is then unchanged). */
+	if (s->l + l > s->m) {
+		char *tmp;
+		size_t m = s->l + l;
+		kroundup32(m);
+		if ((tmp = (char*)realloc(s->s, m)) == 0)
+			return EOF; /* s->m is committed only once the larger buffer exists */
+		s->s = tmp;
+		s->m = m;
+	}
+	memcpy(s->s + s->l, p, l);
+	s->l += l;
+	return l;
+}
+
+static inline int kputw(int c, kstring_t *s)
+{ /* Append the decimal text of c and NUL-terminate. Returns 0, or EOF if realloc() fails (s is then unchanged). */
+	char buf[16];
+	int i, l = 0;
+	unsigned int x = c;
+	if (c < 0) x = -x; /* unsigned negation, so the most negative int is handled too */
+	do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0); /* digits are produced least significant first */
+	if (c < 0) buf[l++] = '-';
+	if (s->l + l + 1 >= s->m) {
+		char *tmp;
+		size_t m = s->l + l + 2;
+		kroundup32(m);
+		if ((tmp = (char*)realloc(s->s, m)) == 0)
+			return EOF; /* s->m is committed only once the larger buffer exists */
+		s->s = tmp;
+		s->m = m;
+	}
+	for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
+	s->s[s->l] = 0;
+	return 0;
+}
+
+static inline int kputuw(unsigned c, kstring_t *s)
+{ /* Append the decimal text of c and NUL-terminate. Returns 0 (or '0' via kputc() when c == 0), or EOF if realloc() fails. */
+	char buf[16];
+	int l, i;
+	unsigned x;
+	if (c == 0) return kputc('0', s);
+	for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0'; /* digits are produced least significant first */
+	if (s->l + l + 1 >= s->m) {
+		char *tmp;
+		size_t m = s->l + l + 2;
+		kroundup32(m);
+		if ((tmp = (char*)realloc(s->s, m)) == 0)
+			return EOF; /* s->m is committed only once the larger buffer exists */
+		s->s = tmp;
+		s->m = m;
+	}
+	for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
+	s->s[s->l] = 0;
+	return 0;
+}
+
+static inline int kputl(long c, kstring_t *s)
+{ /* Append the decimal text of c and NUL-terminate. Returns 0, or EOF if realloc() fails (s is then unchanged). */
+	char buf[32];
+	int i, l = 0;
+	unsigned long x = c;
+	if (c < 0) x = -x; /* unsigned negation, so the most negative long is handled too */
+	do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0); /* digits are produced least significant first */
+	if (c < 0) buf[l++] = '-';
+	if (s->l + l + 1 >= s->m) {
+		char *tmp;
+		size_t m = s->l + l + 2;
+		kroundup32(m);
+		if ((tmp = (char*)realloc(s->s, m)) == 0)
+			return EOF; /* s->m is committed only once the larger buffer exists */
+		s->s = tmp;
+		s->m = m;
+	}
+	for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
+	s->s[s->l] = 0;
+	return 0;
+}
+
+/* Split s->s in place at `delimiter` (0 selects whitespace). The number of fields is stored in *n;
+ * the return value is a heap array of field start offsets into s->s that the caller frees,
+ * or NULL on failure. */
+static inline int *ksplit(kstring_t *s, int delimiter, int *n)
+{
+	int capacity = 0;
+	int *field_starts = 0;
+	*n = ksplit_core(s->s, delimiter, &capacity, &field_starts);
+	return field_starts;
+}
+
+#endif
diff --git a/web/server/h2o/libh2o/deps/klib/ksw.c b/web/server/h2o/libh2o/deps/klib/ksw.c
new file mode 100644
index 000000000..742fec90b
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/ksw.c
@@ -0,0 +1,633 @@
+/* The MIT License
+
+   Copyright (c) 2011 by Attractive Chaos <attractor@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <emmintrin.h>
+#include "ksw.h"
+
+#ifdef __GNUC__
+#define LIKELY(x) __builtin_expect((x),1)
+#define UNLIKELY(x) __builtin_expect((x),0)
+#else
+#define LIKELY(x) (x)
+#define UNLIKELY(x) (x)
+#endif
+
+const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 };
+
+struct _kswq_t {
+	int qlen, slen; /* query length; number of __m128i vectors per profile row */
+	uint8_t shift, mdiff, max, size; /* shift: 256 - lowest score (mod 256); mdiff: highest - lowest score; max: highest score; size: bytes per score, 1 or 2 */
+	__m128i *qp, *H0, *H1, *E, *Hmax; /* query profile, then four slen-vector work rows; all point into the one block allocated by ksw_qinit() */
+};
+
+/**
+ * Initialize the query data structure
+ *
+ * @param size   Number of bytes used to store a score; valid values are 1 or 2
+ * @param qlen   Length of the query sequence
+ * @param query  Query sequence
+ * @param m      Size of the alphabet
+ * @param mat    Scoring matrix in a one-dimension array
+ * @return       Query data structure in a single malloc'ed block (release it with free()), or NULL if the allocation fails
+ */
+kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat)
+{
+	kswq_t *q;
+	int slen, a, tmp, p;
+
+	size = size > 1? 2 : 1;
+	p = 8 * (3 - size); // # values per __m128i
+	slen = (qlen + p - 1) / p; // segmented length
+	q = (kswq_t*)malloc(sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory
+	if (q == 0) return 0; // out of memory: report it instead of dereferencing NULL below
+	q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory
+	q->H0 = q->qp + slen * m;
+	q->H1 = q->H0 + slen;
+	q->E  = q->H1 + slen;
+	q->Hmax = q->E + slen;
+	q->slen = slen; q->qlen = qlen; q->size = size;
+	// compute shift
+	tmp = m * m;
+	for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score
+		if (mat[a] < (int8_t)q->shift) q->shift = mat[a];
+		if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a];
+	}
+	q->max = q->mdiff;
+	q->shift = 256 - q->shift; // NB: q->shift is uint8_t
+	q->mdiff += q->shift; // this is the difference between the min and max scores
+	// An example: p=8, qlen=19, slen=3 and segmentation:
+	//  {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}}
+	if (size == 1) {
+		int8_t *t = (int8_t*)q->qp;
+		for (a = 0; a < m; ++a) {
+			int i, k, nlen = slen * p;
+			const int8_t *ma = mat + a * m;
+			for (i = 0; i < slen; ++i)
+				for (k = i; k < nlen; k += slen) // p iterations
+					*t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift;
+		}
+	} else {
+		int16_t *t = (int16_t*)q->qp;
+		for (a = 0; a < m; ++a) {
+			int i, k, nlen = slen * p;
+			const int8_t *ma = mat + a * m;
+			for (i = 0; i < slen; ++i)
+				for (k = i; k < nlen; k += slen) // p iterations
+					*t++ = (k >= qlen? 0 : ma[query[k]]);
+		}
+	}
+	return q;
+}
+
+kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e)
+{ // SSE2 local alignment of the profiled query q against target[0..tlen-1] with unsigned saturating 8-bit scores; r.score is reported as 255 when the score range is exhausted
+	int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
+	uint64_t *b;
+	__m128i zero, gapoe, gape, shift, *H0, *H1, *E, *Hmax;
+	kswr_t r;
+
+#define __max_16(ret, xx) do { \
+		(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \
+		(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \
+		(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \
+		(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \
+    	(ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \
+	} while (0)
+
+	// initialization
+	r = g_defr;
+	minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000; // with KSW_XSUBO the low 16 bits of xtra are the threshold for recording entries in b[]
+	endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000; // with KSW_XSTOP the low 16 bits of xtra are the score at which the scan stops early
+	m_b = n_b = 0; b = 0;
+	zero = _mm_set1_epi32(0);
+	gapoe = _mm_set1_epi8(_gapo + _gape);
+	gape = _mm_set1_epi8(_gape);
+	shift = _mm_set1_epi8(q->shift);
+	H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
+	slen = q->slen;
+	for (i = 0; i < slen; ++i) {
+		_mm_store_si128(E + i, zero);
+		_mm_store_si128(H0 + i, zero);
+		_mm_store_si128(Hmax + i, zero);
+	}
+	// the core loop
+	for (i = 0; i < tlen; ++i) {
+		int j, k, cmp, imax;
+		__m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
+		h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
+		h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian
+		for (j = 0; LIKELY(j < slen); ++j) {
+			/* SW cells are computed in the following order:
+			 *   H(i,j)   = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
+			 *   E(i+1,j) = max{H(i,j)-q, E(i,j)-r}
+			 *   F(i,j+1) = max{H(i,j)-q, F(i,j)-r}
+			 */
+			// compute H'(i,j); note that at the beginning, h=H'(i-1,j-1)
+			h = _mm_adds_epu8(h, _mm_load_si128(S + j));
+			h = _mm_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j)
+			e = _mm_load_si128(E + j); // e=E'(i,j)
+			h = _mm_max_epu8(h, e);
+			h = _mm_max_epu8(h, f); // h=H'(i,j)
+			max = _mm_max_epu8(max, h); // set max
+			_mm_store_si128(H1 + j, h); // save to H'(i,j)
+			// now compute E'(i+1,j)
+			h = _mm_subs_epu8(h, gapoe); // h=H'(i,j)-gapo
+			e = _mm_subs_epu8(e, gape); // e=E'(i,j)-gape
+			e = _mm_max_epu8(e, h); // e=E'(i+1,j)
+			_mm_store_si128(E + j, e); // save to E'(i+1,j)
+			// now compute F'(i,j+1)
+			f = _mm_subs_epu8(f, gape);
+			f = _mm_max_epu8(f, h);
+			// get H'(i-1,j) and prepare for the next j
+			h = _mm_load_si128(H0 + j); // h=H'(i-1,j)
+		}
+		// NB: we do not need to set E(i,j) as we disallow adjacent insertion and then deletion
+		for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max
+			f = _mm_slli_si128(f, 1);
+			for (j = 0; LIKELY(j < slen); ++j) {
+				h = _mm_load_si128(H1 + j);
+				h = _mm_max_epu8(h, f); // h=H'(i,j)
+				_mm_store_si128(H1 + j, h);
+				h = _mm_subs_epu8(h, gapoe);
+				f = _mm_subs_epu8(f, gape);
+				cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero));
+				if (UNLIKELY(cmp == 0xffff)) goto end_loop16;
+			}
+		}
+end_loop16:
+		//int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n");
+		__max_16(imax, max); // imax is the maximum number in max
+		if (imax >= minsc) { // write the b array; this condition adds branching unfortunately
+			if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append
+				if (n_b == m_b) {
+					m_b = m_b? m_b<<1 : 8;
+					b = (uint64_t*)realloc(b, 8 * m_b); // NOTE(review): the realloc() result is not checked
+				}
+				b[n_b++] = (uint64_t)imax<<32 | i; // high 32 bits: score; low 32 bits: target position
+			} else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
+		}
+		if (imax > gmax) {
+			gmax = imax; te = i; // te is the end position on the target
+			for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector
+				_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
+			if (gmax + q->shift >= 255 || gmax >= endsc) break;
+		}
+		S = H1; H1 = H0; H0 = S; // swap H0 and H1
+	}
+	r.score = gmax + q->shift < 255? gmax : 255;
+	r.te = te;
+	if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score
+		int max = -1, low, high, qlen = slen * 16;
+		uint8_t *t = (uint8_t*)Hmax;
+		for (i = 0; i < qlen; ++i, ++t)
+			if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen; // map the vector byte index back to a query position
+		//printf("%d,%d\n", max, gmax);
+		if (b) {
+			i = (r.score + q->max - 1) / q->max;
+			low = te - i; high = te + i;
+			for (i = 0; i < n_b; ++i) {
+				int e = (int32_t)b[i];
+				if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
+					r.score2 = b[i]>>32, r.te2 = e;
+			}
+		}
+	}
+	free(b);
+	return r;
+}
+
+kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // 16-bit score kernel; the first gap costs -(_o+_e)
+{
+	int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; // gmax: best score so far; te: target position where it was reached (-1 if none)
+	uint64_t *b; // rows scoring >= minsc, each packed as score<<32 | target position (n_b used, m_b allocated)
+	__m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax;
+	kswr_t r;
+
+#define __max_8(ret, xx) do { \
+		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
+		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \
+		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \
+    	(ret) = _mm_extract_epi16((xx), 0); \
+	} while (0)
+
+	// initialization
+	r = g_defr;
+	minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000; // threshold for recording a row in b[]; 0x10000 when KSW_XSUBO is not set
+	endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000; // score at which the scan stops early; 0x10000 when KSW_XSTOP is not set
+	m_b = n_b = 0; b = 0;
+	zero = _mm_set1_epi32(0);
+	gapoe = _mm_set1_epi16(_gapo + _gape);
+	gape = _mm_set1_epi16(_gape);
+	H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
+	slen = q->slen;
+	for (i = 0; i < slen; ++i) { // clear E, H0 and Hmax
+		_mm_store_si128(E + i, zero);
+		_mm_store_si128(H0 + i, zero);
+		_mm_store_si128(Hmax + i, zero);
+	}
+	// the core loop
+	for (i = 0; i < tlen; ++i) {
+		int j, k, imax;
+		__m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
+		h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
+		h = _mm_slli_si128(h, 2); // shift by two bytes, i.e. one 16-bit lane
+		for (j = 0; LIKELY(j < slen); ++j) {
+			h = _mm_adds_epi16(h, *S++);
+			e = _mm_load_si128(E + j);
+			h = _mm_max_epi16(h, e);
+			h = _mm_max_epi16(h, f);
+			max = _mm_max_epi16(max, h); // running per-lane maximum of this row
+			_mm_store_si128(H1 + j, h);
+			h = _mm_subs_epu16(h, gapoe);
+			e = _mm_subs_epu16(e, gape);
+			e = _mm_max_epi16(e, h);
+			_mm_store_si128(E + j, e);
+			f = _mm_subs_epu16(f, gape);
+			f = _mm_max_epi16(f, h);
+			h = _mm_load_si128(H0 + j); // previous-row H for the next j
+		}
+		for (k = 0; LIKELY(k < 16); ++k) { // correction passes that push F into H1; left as soon as no lane of f exceeds h-gapoe
+			f = _mm_slli_si128(f, 2);
+			for (j = 0; LIKELY(j < slen); ++j) {
+				h = _mm_load_si128(H1 + j);
+				h = _mm_max_epi16(h, f);
+				_mm_store_si128(H1 + j, h);
+				h = _mm_subs_epu16(h, gapoe);
+				f = _mm_subs_epu16(f, gape);
+				if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8;
+			}
+		}
+end_loop8:
+		__max_8(imax, max); // imax is the largest 16-bit lane of max
+		if (imax >= minsc) { // record this row in b[]
+			if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // last entry is not the previous row: append
+				if (n_b == m_b) {
+					m_b = m_b? m_b<<1 : 8;
+					b = (uint64_t*)realloc(b, 8 * m_b); // NOTE(review): the realloc result is not checked
+				}
+				b[n_b++] = (uint64_t)imax<<32 | i;
+			} else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
+		}
+		if (imax > gmax) {
+			gmax = imax; te = i; // te is the end position on the target
+			for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector
+				_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
+			if (gmax >= endsc) break;
+		}
+		S = H1; H1 = H0; H0 = S; // swap H0 and H1
+	}
+	r.score = gmax; r.te = te;
+	{
+		int max = -1, low, high, qlen = slen * 8; // each vector holds eight 16-bit scores
+		uint16_t *t = (uint16_t*)Hmax;
+		for (i = 0, r.qe = -1; i < qlen; ++i, ++t) // first maximum in Hmax; its flat index is mapped back to a query position
+			if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen;
+		if (b) {
+			i = (r.score + q->max - 1) / q->max; // ceil(score / q->max)
+			low = te - i; high = te + i; // rows ending inside [low, high] are not considered for score2
+			for (i = 0; i < n_b; ++i) {
+				int e = (int32_t)b[i];
+				if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
+					r.score2 = b[i]>>32, r.te2 = e;
+			}
+		}
+	}
+	free(b);
+	return r;
+}
+
+static void revseq(int l, uint8_t *s) // reverse the first l bytes of s in place; nothing happens for l <= 1
+{
+	int lo = 0, hi = l - 1; // two cursors walking towards each other
+	for (; lo < hi; ++lo, --hi) {
+		uint8_t tmp = s[lo]; s[lo] = s[hi]; s[hi] = tmp; }
+}
+
+kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry)
+{
+	int size;
+	kswq_t *q;
+	kswr_t r, rr;
+	kswr_t (*func)(kswq_t*, int, const uint8_t*, int, int, int);
+	// forward pass: reuse the caller's cached query profile when there is one, otherwise build it (and hand it back via *qry)
+	q = (qry && *qry)? *qry : ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat);
+	if (qry && *qry == 0) *qry = q;
+	size = q->size; func = size == 2? ksw_i16 : ksw_u8; // pick the kernel matching the profile's score width
+	r = func(q, tlen, target, gapo, gape, xtra);
+	if (qry == 0) free(q); // nobody to hand the profile to: release it here
+	if ((xtra&KSW_XSTART) == 0 || ((xtra&KSW_XSUBO) && r.score < (xtra&0xffff))) return r;
+	if (r.qe < 0 || r.te < 0) return r; // end positions were left unset (e.g. 8-bit overflow, score 255): there is no prefix to align backwards
+	revseq(r.qe + 1, query); revseq(r.te + 1, target); // +1 because qe/te points to the exact end, not the position after the end
+	q = ksw_qinit(size, r.qe + 1, query, m, mat);
+	rr = func(q, tlen, target, gapo, gape, KSW_XSTOP | r.score); // reverse pass stops once the forward score is reproduced
+	revseq(r.qe + 1, query); revseq(r.te + 1, target); // restore the caller's sequences
+	free(q);
+	if (r.score == rr.score) // start = end minus the length covered by the reverse alignment
+		r.tb = r.te - rr.te, r.qb = r.qe - rr.qe;
+	return r;
+}
+
+/********************
+ *** SW extension ***
+ ********************/
+
+typedef struct { // one DP cell carried from row to row by ksw_extend() and ksw_global()
+	int32_t h, e; // h: H score handed to the next row; e: E score for the next row
+} eh_t;
+/* Banded extension of a seed that already scores h0 (negative h0 is treated as 0). Returns the best score reached, never below h0.
+ * *_qle and *_tle (when non-NULL) receive the query/target lengths covered by the best extension, 0 if the seed cannot be improved.
+ * mat is the m*m score matrix, w the band width; a gap of length l costs gapo + l*gape. */
+int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle)
+{
+	eh_t *eh; int8_t *qp; // score array and query profile
+	int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap;
+	if (h0 < 0) h0 = 0;
+	// allocate memory; an empty query or a failed allocation is reported as "no extension" instead of touching eh[1] or NULL
+	qp = qlen > 0? malloc(qlen * m) : 0;
+	eh = qp? calloc(qlen + 1, sizeof(eh_t)) : 0;
+	if (eh == 0) { free(qp); if (_qle) *_qle = 0; if (_tle) *_tle = 0; return h0; }
+	for (k = i = 0; k < m; ++k) { // generate the query profile
+		const int8_t *p = &mat[k * m];
+		for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]];
+	}
+	// fill the first row
+	eh[0].h = h0; eh[1].h = h0 > gapoe? h0 - gapoe : 0;
+	for (j = 2; j <= qlen && eh[j-1].h > gape; ++j)
+		eh[j].h = eh[j-1].h - gape;
+	// adjust $w if it is too large
+	k = m * m;
+	for (i = 0, max = 0; i < k; ++i) // get the max score
+		max = max > mat[i]? max : mat[i];
+	max_gap = gape > 0? (int)((double)(qlen * max - gapo) / gape + 1.) : w; // with gape == 0 there is no finite bound: keep w
+	max_gap = max_gap > 1? max_gap : 1;
+	w = w < max_gap? w : max_gap;
+	// DP loop
+	max = h0, max_i = max_j = -1;
+	beg = 0, end = qlen;
+	for (i = 0; LIKELY(i < tlen); ++i) {
+		int f = 0, h1, m = 0, mj = -1;
+		int8_t *q = &qp[target[i] * qlen];
+		// compute the first column
+		h1 = h0 - (gapo + gape * (i + 1));
+		if (h1 < 0) h1 = 0;
+		// apply the band and the constraint (if provided)
+		if (beg < i - w) beg = i - w;
+		if (end > i + w + 1) end = i + w + 1;
+		if (end > qlen) end = qlen;
+		for (j = beg; LIKELY(j < end); ++j) {
+			// At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
+			// Similar to SSE2-SW, cells are computed in the following order:
+			//   H(i,j)   = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
+			//   E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape
+			//   F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape
+			eh_t *p = &eh[j];
+			int h = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j)
+			p->h = h1;          // set H(i,j-1) for the next row
+			h += q[j];
+			h = h > e? h : e;
+			h = h > f? h : f;
+			h1 = h;             // save H(i,j) to h1 for the next column
+			mj = m > h? mj : j;
+			m = m > h? m : h;   // m is stored at eh[mj+1]
+			h -= gapoe;
+			h = h > 0? h : 0;
+			e -= gape;
+			e = e > h? e : h;   // computed E(i+1,j)
+			p->e = e;           // save E(i+1,j) for the next row
+			f -= gape;
+			f = f > h? f : h;   // computed F(i,j+1)
+		}
+		eh[end].h = h1; eh[end].e = 0;
+		if (m == 0) break;
+		if (m > max) max = m, max_i = i, max_j = mj;
+		// update beg and end for the next round
+		for (j = mj; j >= beg && eh[j].h; --j);
+		beg = j + 1;
+		for (j = mj + 2; j <= end && eh[j].h; ++j);
+		end = j;
+		//beg = 0; end = qlen; // uncomment this line for debugging
+	}
+	free(eh); free(qp);
+	if (_qle) *_qle = max_j + 1; if (_tle) *_tle = max_i + 1;
+	return max;
+}
+
+/********************
+ * Global alignment *
+ ********************/
+
+#define MINUS_INF -0x40000000
+
+static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, int op, int len) // append len x op; returns the (possibly moved) array
+{
+	const int n = *n_cigar; // entries stored so far; each entry is len<<4 | op
+	if (n > 0 && op == (cigar[n - 1]&0xf)) { // same operation as the last entry: just lengthen it
+		cigar[n - 1] += len<<4; return cigar;
+	}
+	if (n == *m_cigar) // array is full: double the capacity, starting from 4 entries
+		*m_cigar = *m_cigar? (*m_cigar)<<1 : 4, cigar = realloc(cigar, (*m_cigar) << 2);
+	cigar[n] = len<<4 | op; *n_cigar = n + 1;
+	return cigar;
+}
+
+int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_) // banded global alignment: returns the score; writes a CIGAR when both n_cigar_ and cigar_ are given
+{
+	eh_t *eh;
+	int8_t *qp; // query profile
+	int i, j, k, gapoe = gapo + gape, score, n_col;
+	uint8_t *z; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will be a little more complex
+	if (n_cigar_) *n_cigar_ = 0;
+	// allocate memory
+	n_col = qlen < 2*w+1? qlen : 2*w+1; // maximum #columns of the backtrack matrix
+	z = malloc(n_col * tlen); // NOTE(review): none of the three allocations below is checked
+	qp = malloc(qlen * m);
+	eh = calloc(qlen + 1, 8);
+	// generate the query profile
+	for (k = i = 0; k < m; ++k) {
+		const int8_t *p = &mat[k * m];
+		for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]];
+	}
+	// fill the first row
+	eh[0].h = 0; eh[0].e = MINUS_INF;
+	for (j = 1; j <= qlen && j <= w; ++j) // inside the band: a leading gap of length j
+		eh[j].h = -(gapo + gape * j), eh[j].e = MINUS_INF;
+	for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; // everything is -inf outside the band
+	// DP loop
+	for (i = 0; LIKELY(i < tlen); ++i) { // target sequence is in the outer loop
+		int32_t f = MINUS_INF, h1, beg, end;
+		int8_t *q = &qp[target[i] * qlen];
+		uint8_t *zi = &z[i * n_col]; // backtrack row i, indexed by j - beg
+		beg = i > w? i - w : 0;
+		end = i + w + 1 < qlen? i + w + 1 : qlen; // only loop through [beg,end) of the query sequence
+		h1 = beg == 0? -(gapo + gape * (i + 1)) : MINUS_INF; // first column: a leading gap of length i+1, or -inf once the band has left column 0
+		for (j = beg; LIKELY(j < end); ++j) {
+			// This loop is organized in a similar way to ksw_extend() and ksw_sse2(), except:
+			// 1) not checking h>0; 2) recording direction for backtracking
+			eh_t *p = &eh[j];
+			int32_t h = p->h, e = p->e;
+			uint8_t d; // direction
+			p->h = h1;
+			h += q[j];
+			d = h > e? 0 : 1; // bits 0-1: where H(i,j) came from (0: diagonal, 1: E, 2: F)
+			h = h > e? h : e;
+			d = h > f? d : 2;
+			h = h > f? h : f;
+			h1 = h;
+			h -= gapoe;
+			e -= gape;
+			d |= e > h? 1<<2 : 0; // bits 2-3: set to 1 when the next E extends the current E
+			e = e > h? e : h;
+			p->e = e;
+			f -= gape;
+			d |= f > h? 2<<4 : 0; // if we want to halve the memory, use one bit only, instead of two
+			f = f > h? f : h;
+			zi[j - beg] = d; // z[i,j] keeps h for the current cell and e/f for the next cell
+		}
+		eh[end].h = h1; eh[end].e = MINUS_INF;
+	}
+	score = eh[qlen].h; // H of the last target row at the last query column
+	if (n_cigar_ && cigar_) { // backtrack
+		int n_cigar = 0, m_cigar = 0, which = 0;
+		uint32_t *cigar = 0, tmp;
+		i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell
+		while (i >= 0 && k >= 0) {
+			which = z[i * n_col + (k - (i > w? i - w : 0))] >> (which<<1) & 3; // read the 2-bit field selected by the current state
+			if (which == 0)      cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --k; // op 0: step back on both sequences
+			else if (which == 1) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --i; // op 2: step back on the target only
+			else                 cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --k; // op 1: step back on the query only
+		}
+		if (i >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, i + 1); // leftover target bases
+		if (k >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, k + 1); // leftover query bases
+		for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR
+			tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp;
+		*n_cigar_ = n_cigar, *cigar_ = cigar; // the array comes from realloc() in push_cigar(); nothing in this function frees it
+	}
+	free(eh); free(qp); free(z);
+	return score;
+}
+
+/*******************************************
+ * Main function (not compiled by default) *
+ *******************************************/
+
+#ifdef _KSW_MAIN
+
+#include <unistd.h>
+#include <stdio.h>
+#include <zlib.h>
+#include "kseq.h"
+KSEQ_INIT(gzFile, gzread)
+
+unsigned char seq_nt4_table[256] = { // byte -> base code: A/a=0, C/c=1, G/g=2, T/t=3, every other byte=4
+	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
+	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
+	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
+};
+
+int main(int argc, char *argv[]) // all-pair alignment of every query against every target, on both strands unless -f is given
+{
+	int c, sa = 1, sb = 3, i, j, k, forward_only = 0, max_rseq = 0;
+	int8_t mat[25];
+	int gapo = 5, gape = 2, minsc = 0, xtra = KSW_XSTART;
+	uint8_t *rseq = 0;
+	gzFile fpt, fpq;
+	kseq_t *kst, *ksq;
+
+	// parse command line
+	while ((c = getopt(argc, argv, "a:b:q:r:ft:1")) >= 0) {
+		switch (c) {
+			case 'a': sa = atoi(optarg); break;
+			case 'b': sb = atoi(optarg); break;
+			case 'q': gapo = atoi(optarg); break;
+			case 'r': gape = atoi(optarg); break;
+			case 't': minsc = atoi(optarg); break;
+			case 'f': forward_only = 1; break;
+			case '1': xtra |= KSW_XBYTE; break;
+		}
+	}
+	if (optind + 2 > argc) {
+		fprintf(stderr, "Usage: ksw [-1] [-f] [-a%d] [-b%d] [-q%d] [-r%d] [-t%d] <target.fa> <query.fa>\n", sa, sb, gapo, gape, minsc);
+		return 1;
+	}
+	if (minsc > 0xffff) minsc = 0xffff; else if (minsc < 0) minsc = 0; // keep -t inside the 16 bits xtra reserves for it; a negative value would set unrelated flag bits
+	xtra |= KSW_XSUBO | minsc;
+	// initialize scoring matrix
+	for (i = k = 0; i < 4; ++i) {
+		for (j = 0; j < 4; ++j)
+			mat[k++] = i == j? sa : -sb;
+		mat[k++] = 0; // ambiguous base
+	}
+	for (j = 0; j < 5; ++j) mat[k++] = 0;
+	fpt = gzopen(argv[optind], "r"); fpq = gzopen(argv[optind+1], "r"); // open files
+	if (fpt == 0 || fpq == 0) { fprintf(stderr, "ksw: failed to open the input files\n"); if (fpt) gzclose(fpt); if (fpq) gzclose(fpq); return 1; }
+	kst = kseq_init(fpt); ksq = kseq_init(fpq);
+	// all-pair alignment
+	while (kseq_read(ksq) > 0) {
+		kswq_t *q[2] = {0, 0};
+		kswr_t r;
+		for (i = 0; i < (int)ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(unsigned char)ksq->seq.s[i]]; // unsigned index: bytes >= 0x80 must not go negative
+		if (!forward_only) { // reverse
+			if ((int)ksq->seq.m > max_rseq) {
+				max_rseq = ksq->seq.m;
+				rseq = (uint8_t*)realloc(rseq, max_rseq);
+			}
+			for (i = 0, j = ksq->seq.l - 1; i < (int)ksq->seq.l; ++i, --j)
+				rseq[j] = ksq->seq.s[i] == 4? 4 : 3 - ksq->seq.s[i];
+		}
+		gzrewind(fpt); kseq_rewind(kst);
+		while (kseq_read(kst) > 0) {
+			for (i = 0; i < (int)kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(unsigned char)kst->seq.s[i]];
+			r = ksw_align(ksq->seq.l, (uint8_t*)ksq->seq.s, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[0]);
+			if (r.score >= minsc)
+				printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, r.qb, r.qe+1, r.score, r.score2, r.te2);
+			if (rseq) {
+				r = ksw_align(ksq->seq.l, rseq, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[1]);
+				if (r.score >= minsc)
+					printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, (int)ksq->seq.l - r.qb, (int)ksq->seq.l - 1 - r.qe, r.score, r.score2, r.te2);
+			}
+		}
+		free(q[0]); free(q[1]); // profiles cached by ksw_align() for the two strands
+	}
+	free(rseq);
+	kseq_destroy(kst); gzclose(fpt);
+	kseq_destroy(ksq); gzclose(fpq);
+	return 0;
+}
+#endif
diff --git a/web/server/h2o/libh2o/deps/klib/ksw.h b/web/server/h2o/libh2o/deps/klib/ksw.h
new file mode 100644
index 000000000..5162dc03d
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/ksw.h
@@ -0,0 +1,72 @@
+#ifndef __AC_KSW_H
+#define __AC_KSW_H
+
+#include <stdint.h>
+
+#define KSW_XBYTE  0x10000 // store scores in an unsigned byte
+#define KSW_XSTOP  0x20000 // stop once the best score reaches (xtra&0xffff)
+#define KSW_XSUBO  0x40000 // track the 2nd best score, thresholded by (xtra&0xffff)
+#define KSW_XSTART 0x80000 // also find the start positions
+
+struct _kswq_t;
+typedef struct _kswq_t kswq_t;
+
+typedef struct { // result of ksw_align(); fields that were not computed are -1
+	int score; // best score
+	int te, qe; // target end and query end (the exact last position, not one past it)
+	int score2, te2; // second best score and ending position on the target
+	int tb, qb; // target start and query start; set only with KSW_XSTART and when the reverse pass reproduces the score
+} kswr_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	/**
+	 * Aligning two sequences
+	 *
+	 * @param qlen    length of the query sequence (typically <tlen)
+	 * @param query   query sequence with 0 <= query[i] < m
+	 * @param tlen    length of the target sequence
+	 * @param target  target sequence
+	 * @param m       number of residue types
+	 * @param mat     m*m scoring matrix stored as a one-dimensional array
+	 * @param gapo    gap open penalty; a gap of length l cost "-(gapo+l*gape)"
+	 * @param gape    gap extension penalty
+	 * @param xtra    extra information (see below)
+	 * @param qry     query profile (see below)
+	 *
+	 * @return        alignment information in a struct; unset values are set to -1
+	 *
+	 * When xtra==0, ksw_align() uses a signed two-byte integer to store a
+	 * score and only finds the best score and the end positions. The 2nd best
+	 * score or the start positions are not attempted. The default behavior can
+	 * be tuned by setting KSW_X* flags:
+	 *
+	 *   KSW_XBYTE:  use an unsigned byte to store a score. If overflow occurs,
+	 *               kswr_t::score will be set to 255
+	 *
+	 *   KSW_XSUBO:  track the 2nd best score and the ending position on the
+	 *               target if the 2nd best is higher than (xtra&0xffff)
+	 *
+	 *   KSW_XSTOP:  stop if the maximum score is above (xtra&0xffff)
+	 *
+	 *   KSW_XSTART: find the start positions
+	 *
+	 * When *qry==NULL, ksw_align() will compute and allocate the query profile
+	 * and when the function returns, *qry will point to the profile, which can
+	 * be deallocated simply by free(). If one query is aligned against multiple
+	 * target sequences, *qry should be set to NULL during the first call and
+	 * freed after the last call. Note that qry can equal 0. In this case, the
+	 * query profile will be deallocated in ksw_align().
+	 */
+	kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry);
+
+	int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle); // banded extension from initial score h0; returns the best score; *_qle/*_tle (optional) get the query/target lengths of that extension
+	int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *_n_cigar, uint32_t **_cigar); // banded global alignment; returns the score; when both are non-NULL, *_cigar gets a realloc()-allocated array of *_n_cigar entries, each len<<4|op
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/web/server/h2o/libh2o/deps/klib/kthread.c b/web/server/h2o/libh2o/deps/klib/kthread.c
new file mode 100644
index 000000000..80f84cb35
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/kthread.c
@@ -0,0 +1,143 @@
+#include <pthread.h>
+#include <stdlib.h>
+#include <limits.h>
+
+/************
+ * kt_for() *
+ ************/
+
+struct kt_for_t;
+
+typedef struct { // per-thread state of kt_for()
+	struct kt_for_t *t; // the shared job this worker belongs to
+	long i; // next index this worker will claim; advanced by n_threads on every claim
+} ktf_worker_t;
+
+typedef struct kt_for_t { // job shared by all kt_for() workers
+	int n_threads; // number of workers; also the stride between indices of one worker
+	long n; // indices [0, n) are to be processed
+	ktf_worker_t *w; // array of n_threads workers
+	void (*func)(void*,long,int); // called as func(data, index, worker number)
+	void *data; // opaque pointer passed to func
+} kt_for_t;
+
+static inline long steal_work(kt_for_t *t) // claim one index from the worker whose counter is lowest; returns it, or -1 if it is already past the end
+{
+	int i, min_i = -1;
+	long k, min = LONG_MAX;
+	for (i = 0; i < t->n_threads; ++i) // find the worker with the smallest next index (plain reads, no locking)
+		if (min > t->w[i].i) min = t->w[i].i, min_i = i;
+	k = __sync_fetch_and_add(&t->w[min_i].i, t->n_threads); // atomically take that worker's next index
+	return k >= t->n? -1 : k;
+}
+
+static void *ktf_worker(void *data) // thread body of kt_for(); data is this thread's ktf_worker_t
+{
+	ktf_worker_t *w = (ktf_worker_t*)data;
+	long i;
+	for (;;) { // first process this worker's own indices: i, i+n_threads, i+2*n_threads, ...
+		i = __sync_fetch_and_add(&w->i, w->t->n_threads);
+		if (i >= w->t->n) break;
+		w->t->func(w->t->data, i, w - w->t->w); // third argument: this worker's position in the worker array
+	}
+	while ((i = steal_work(w->t)) >= 0) // then take indices from workers that are still behind
+		w->t->func(w->t->data, i, w - w->t->w);
+	pthread_exit(0);
+}
+
+void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n) // call func(data, i, worker number) for every i in [0,n), spread over n_threads workers
+{
+	int i, n_started; long j; // n_started: number of worker threads actually created
+	kt_for_t t; pthread_t *tid;
+	if (n_threads <= 1) { for (j = 0; j < n; ++j) func(data, j, 0); return; } // nothing to parallelize (also covers n_threads <= 0): run in the caller
+	t.func = func, t.data = data, t.n_threads = n_threads, t.n = n;
+	t.w = (ktf_worker_t*)alloca(n_threads * sizeof(ktf_worker_t)); tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t));
+	for (i = 0; i < n_threads; ++i) t.w[i].t = &t, t.w[i].i = i; // worker i starts at index i and strides by n_threads
+	for (n_started = 0; n_started < n_threads; ++n_started) // stop at the first thread that cannot be created
+		if (pthread_create(&tid[n_started], 0, ktf_worker, &t.w[n_started]) != 0) break;
+	for (i = 0; i < n_started; ++i) pthread_join(tid[i], 0); // join only real threads; they steal the indices of workers that never started
+	if (n_started == 0) for (j = 0; j < n; ++j) func(data, j, 0); // no thread could be created at all: do the work here
+}
+
+/*****************
+ * kt_pipeline() *
+ *****************/
+
+struct ktp_t;
+
+typedef struct { // per-thread state of kt_pipeline()
+	struct ktp_t *pl; // the pipeline this worker belongs to
+	int step, running; // step: pipeline step to run next; running: non-zero while func is executing that step
+	void *data; // output of the previous step, passed as input to the next one
+} ktp_worker_t;
+
+typedef struct ktp_t { // state shared by all kt_pipeline() workers
+	void *shared; // opaque pointer passed as the first argument of func
+	void *(*func)(void*, int, void*); // called as func(shared, step, input); its return value becomes the next step's input
+	int n_workers, n_steps;
+	ktp_worker_t *workers; // array of n_workers entries
+	pthread_mutex_t mutex; // protects the step/running fields of all workers
+	pthread_cond_t cv; // broadcast whenever a worker finishes a step
+} ktp_t;
+
+static void *ktp_worker(void *data) // thread body of kt_pipeline(); data is this thread's ktp_worker_t
+{
+	ktp_worker_t *w = (ktp_worker_t*)data;
+	ktp_t *p = w->pl;
+	while (w->step < p->n_steps) { // step == n_steps marks a retired worker
+		// test whether we can kick off the job with this worker
+		pthread_mutex_lock(&p->mutex);
+		for (;;) {
+			int i;
+			// test whether another worker is doing the same step
+			for (i = 0; i < p->n_workers; ++i) {
+				if (w == &p->workers[i]) continue; // ignore itself
+				if (p->workers[i].running && p->workers[i].step == w->step)
+					break;
+			}
+			if (i == p->n_workers) break; // no other workers doing w->step; then this worker will
+			pthread_cond_wait(&p->cv, &p->mutex); // wait until some worker finishes a step, then check again
+		}
+		w->running = 1;
+		pthread_mutex_unlock(&p->mutex);
+
+		// working on w->step
+		w->data = p->func(p->shared, w->step, w->step? w->data : 0); // for the first step, input is NULL
+
+		// update step and let other workers know
+		pthread_mutex_lock(&p->mutex);
+		w->step = w->step == p->n_steps - 1 || w->data? (w->step + 1) % p->n_steps : p->n_steps; // move on (wrapping to step 0 after the last); a NULL result from a non-final step retires the worker
+		w->running = 0;
+		pthread_cond_broadcast(&p->cv);
+		pthread_mutex_unlock(&p->mutex);
+	}
+	pthread_exit(0);
+}
+
+void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps) // run an n_steps pipeline on n_threads workers; at most one worker executes a given step at any time
+{
+	ktp_t aux;
+	pthread_t *tid;
+	int i, n_started; // n_started: number of worker threads actually created
+
+	if (n_threads < 1) n_threads = 1;
+	aux.n_workers = n_threads;
+	aux.n_steps = n_steps;
+	aux.func = func;
+	aux.shared = shared_data;
+	pthread_mutex_init(&aux.mutex, 0);
+	pthread_cond_init(&aux.cv, 0);
+
+	aux.workers = alloca(n_threads * sizeof(ktp_worker_t));
+	for (i = 0; i < n_threads; ++i) { // every worker starts idle at step 0 with no data
+		ktp_worker_t *w = &aux.workers[i];
+		w->step = w->running = 0; w->pl = &aux; w->data = 0;
+	}
+
+	tid = alloca(n_threads * sizeof(pthread_t));
+	for (n_started = 0; n_started < n_threads; ++n_started) if (pthread_create(&tid[n_started], 0, ktp_worker, &aux.workers[n_started]) != 0) break; // stop at the first failure
+	for (i = 0; i < n_started; ++i) pthread_join(tid[i], 0); // join only threads that exist; workers never started stay idle and block nobody
+
+	pthread_mutex_destroy(&aux.mutex);
+	pthread_cond_destroy(&aux.cv);
+}
diff --git a/web/server/h2o/libh2o/deps/klib/kurl.c b/web/server/h2o/libh2o/deps/klib/kurl.c
new file mode 100644
index 000000000..3bf92901c
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/kurl.c
@@ -0,0 +1,583 @@
+#include <stdio.h>
+#include <fcntl.h>
+#include <ctype.h>
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <curl/curl.h>
+#include "kurl.h"
+
+/**********************
+ *** Core kurl APIs ***
+ **********************/
+
+#define KU_DEF_BUFLEN   0x8000
+#define KU_MAX_SKIP     (KU_DEF_BUFLEN<<1) // if seek step is smaller than this, skip
+
+#define kurl_isfile(u) ((u)->fd >= 0)
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+struct kurl_t { // one open stream: either a local file (fd >= 0) or a remote URL read through cURL
+	CURLM *multi; // cURL multi handle
+	CURL *curl;   // cURL easy handle
+	uint8_t *buf; // buffer
+	off_t off0;   // offset of the first byte in the buffer; the actual file offset equals off0 + p_buf
+	int fd;       // file descriptor for a normal file; <0 for a remote file
+	int m_buf;    // max buffer size; for a remote file, CURL_MAX_WRITE_SIZE*2 is recommended
+	int l_buf;    // length of the buffer; l_buf == 0 iff the input has been read entirely; l_buf <= m_buf
+	int p_buf;    // file position in the buffer; p_buf <= l_buf
+	int done_reading; // true if we can read nothing from the file; buffer may not be empty even if done_reading is set
+	int err;      // error code
+	struct curl_slist *hdr; // extra HTTP headers (the S3 date and authorization lines); NULL otherwise
+};
+
+typedef struct { // result of s3_parse(); kurl_open() releases each string with free()
+	char *url, *date, *auth; // url is given to cURL as the URL; date and auth are added as HTTP header lines
+} s3aux_t;
+
+int kurl_init(void) // required for SSL and win32 socket; NOT thread safe
+{
+	return curl_global_init(CURL_GLOBAL_DEFAULT); // the status code of curl_global_init() is passed through unchanged
+}
+
+void kurl_destroy(void) // counterpart of kurl_init()
+{
+	curl_global_cleanup();
+}
+
+static int prepare(kurl_t *ku, int do_seek) // position the underlying source at ku->off0 and empty the buffer; returns 0 on success, -1 on failure
+{
+	if (kurl_isfile(ku)) {
+		if (do_seek && lseek(ku->fd, ku->off0, SEEK_SET) != ku->off0)
+			return -1;
+	} else { // FIXME: for S3, we need to re-authorize
+		// restart the transfer at off0 by re-attaching the easy handle; the removal result is ignored because the handle may not be attached yet
+		curl_multi_remove_handle(ku->multi, ku->curl);
+		if (curl_easy_setopt(ku->curl, CURLOPT_RESUME_FROM_LARGE, (curl_off_t)ku->off0) != CURLE_OK) return -1; // _LARGE takes a curl_off_t; plain RESUME_FROM expects a long
+		if (curl_multi_add_handle(ku->multi, ku->curl) != CURLM_OK) return -1;
+	}
+	ku->p_buf = ku->l_buf = 0; // empty the buffer
+	return 0;
+}
+
+static size_t write_cb(char *ptr, size_t size, size_t nmemb, void *data) // callback required by cURL: appends the received chunk to the kurl buffer
+{
+	kurl_t *const dst = (kurl_t*)data;
+	const ssize_t chunk = size * nmemb; // bytes delivered by this call
+	if (chunk + dst->l_buf <= dst->m_buf) { // enough room left: take the whole chunk
+		memcpy(dst->buf + dst->l_buf, ptr, chunk);
+		dst->l_buf += chunk; return chunk;
+	}
+	return CURL_WRITEFUNC_PAUSE; // the chunk does not fit: nothing is copied
+}
+
+static int fill_buffer(kurl_t *ku) // refill the (fully consumed) buffer; returns the number of bytes now buffered, 0 when nothing more can be read
+{
+	assert(ku->p_buf == ku->l_buf); // buffer is always used up when fill_buffer() is called; otherwise a bug
+	ku->off0 += ku->l_buf;
+	ku->p_buf = ku->l_buf = 0;
+	if (ku->done_reading) return 0;
+	if (kurl_isfile(ku)) {
+		// The following block is equivalent to "ku->l_buf = read(ku->fd, ku->buf, ku->m_buf)" on Mac.
+		// On Linux, the man page does not specify whether read() guarantees to read ku->m_buf bytes
+		// even if ->fd references a normal file with sufficient remaining bytes.
+		while (ku->l_buf < ku->m_buf) {
+			int l;
+			l = read(ku->fd, ku->buf + ku->l_buf, ku->m_buf - ku->l_buf);
+			if (l <= 0) break; // 0 is end-of-file; -1 is a read error and must not be added to l_buf
+			ku->l_buf += l;
+		}
+		if (ku->l_buf < ku->m_buf) ku->done_reading = 1; // a short buffer means the file is exhausted (or unreadable)
+	} else {
+		int n_running, rc;
+		fd_set fdr, fdw, fde;
+		do {
+			int maxfd = -1;
+			long curl_to = -1;
+			struct timeval to;
+			// the following is adapted from docs/examples/fopen.c
+			to.tv_sec = 10, to.tv_usec = 0; // 10 seconds
+			curl_multi_timeout(ku->multi, &curl_to);
+			if (curl_to >= 0) { // cURL suggests a timeout (in ms): use it, capped at one second
+				to.tv_sec = curl_to / 1000;
+				if (to.tv_sec > 1) to.tv_sec = 1;
+				else to.tv_usec = (curl_to % 1000) * 1000;
+			}
+			FD_ZERO(&fdr); FD_ZERO(&fdw); FD_ZERO(&fde);
+			curl_multi_fdset(ku->multi, &fdr, &fdw, &fde, &maxfd); // FIXME: check return code
+			if (maxfd >= 0 && (rc = select(maxfd+1, &fdr, &fdw, &fde, &to)) < 0) break;
+			if (maxfd < 0) { // check curl_multi_fdset.3 about why we wait for 100ms here
+				struct timespec req, rem;
+				req.tv_sec = 0; req.tv_nsec = 100000000; // this is 100ms
+				nanosleep(&req, &rem);
+			}
+			curl_easy_pause(ku->curl, CURLPAUSE_CONT); // resume a transfer that write_cb() may have paused
+			rc = curl_multi_perform(ku->multi, &n_running); // FIXME: check return code
+		} while (n_running && ku->l_buf < ku->m_buf - CURL_MAX_WRITE_SIZE); // keep going while there is room for one more maximal chunk
+		if (ku->l_buf < ku->m_buf - CURL_MAX_WRITE_SIZE) ku->done_reading = 1;
+	}
+	return ku->l_buf;
+}
+
+int kurl_close(kurl_t *ku) // release everything held by ku (NULL is accepted); always returns 0
+{
+	if (ku != 0) {
+		if (kurl_isfile(ku)) close(ku->fd); // local file: only the descriptor to release
+		else { // remote file: detach the easy handle, then destroy both handles and the header list
+			curl_multi_remove_handle(ku->multi, ku->curl);
+			curl_easy_cleanup(ku->curl); curl_multi_cleanup(ku->multi);
+			if (ku->hdr) curl_slist_free_all(ku->hdr);
+		}
+		free(ku->buf); free(ku);
+	}
+	return 0;
+}
+
+kurl_t *kurl_open(const char *url, kurl_opt_t *opt) // open a local path or a "scheme://" URL and pre-fill the buffer; returns NULL on failure
+{
+	extern s3aux_t s3_parse(const char *url, const char *_id, const char *_secret, const char *fn);
+	const char *p, *q;
+	kurl_t *ku;
+	int fd = -1, is_file = 1, failed = 0;
+
+	p = strstr(url, "://");
+	if (p && *p) { // remote only if everything before "://" is alphanumeric
+		for (q = url; q != p; ++q)
+			if (!isalnum((unsigned char)*q)) break; // cast: isalnum() is undefined for negative char values
+		if (q == p) is_file = 0;
+	}
+	if (is_file && (fd = open(url, O_RDONLY)) < 0) return 0;
+
+	ku = (kurl_t*)calloc(1, sizeof(kurl_t));
+	if (ku == 0) { if (fd >= 0) close(fd); return 0; } // out of memory: do not leak the descriptor
+	ku->fd = is_file? fd : -1;
+	if (!kurl_isfile(ku)) {
+		ku->multi = curl_multi_init(); ku->curl = curl_easy_init();
+		if (ku->multi == 0 || ku->curl == 0) { kurl_close(ku); return 0; } // cURL could not create its handles
+		if (strstr(url, "s3://") == url) {
+			s3aux_t a;
+			a = s3_parse(url, (opt? opt->s3keyid : 0), (opt? opt->s3secretkey : 0), (opt? opt->s3key_fn : 0));
+			if (a.url == 0 || a.date == 0 || a.auth == 0) {
+				free(a.date); free(a.auth); free(a.url); kurl_close(ku); // release whichever parts were produced
+				return 0;
+			}
+			ku->hdr = curl_slist_append(ku->hdr, a.date);
+			ku->hdr = curl_slist_append(ku->hdr, a.auth);
+			curl_easy_setopt(ku->curl, CURLOPT_URL, a.url);
+			curl_easy_setopt(ku->curl, CURLOPT_HTTPHEADER, ku->hdr);
+			free(a.date); free(a.auth); free(a.url);
+		} else curl_easy_setopt(ku->curl, CURLOPT_URL, url);
+		curl_easy_setopt(ku->curl, CURLOPT_WRITEDATA, ku);
+		curl_easy_setopt(ku->curl, CURLOPT_VERBOSE, 0L);
+		curl_easy_setopt(ku->curl, CURLOPT_NOSIGNAL, 1L);
+		curl_easy_setopt(ku->curl, CURLOPT_WRITEFUNCTION, write_cb);
+		// NOTE(review): peer and host verification are switched off below, so HTTPS transfers are not authenticated
+		curl_easy_setopt(ku->curl, CURLOPT_SSL_VERIFYPEER, 0L);
+		curl_easy_setopt(ku->curl, CURLOPT_SSL_VERIFYHOST, 0L);
+		curl_easy_setopt(ku->curl, CURLOPT_FOLLOWLOCATION, 1L);
+	}
+	ku->m_buf = KU_DEF_BUFLEN;
+	if (!kurl_isfile(ku) && ku->m_buf < CURL_MAX_WRITE_SIZE * 2) ku->m_buf = CURL_MAX_WRITE_SIZE * 2; // for remote files, the buffer set to 2*CURL_MAX_WRITE_SIZE
+	ku->buf = (uint8_t*)calloc(ku->m_buf, 1);
+	failed = ku->buf == 0 || (!kurl_isfile(ku) && prepare(ku, 0) < 0) || fill_buffer(ku) <= 0; // remote streams attach the transfer first; both kinds then need a first fill
+	if (failed) {
+		kurl_close(ku);
+		return 0;
+	}
+	return ku;
+}
+
+kurl_t *kurl_dopen(int fd) // wrap an already open descriptor; returns NULL on failure
+{
+	kurl_t *ku;
+	if (fd < 0) return 0; // a negative fd would be mistaken for a remote stream with no cURL handles
+	if ((ku = (kurl_t*)calloc(1, sizeof(kurl_t))) == 0) return 0; // out of memory; fd is left open for the caller
+	ku->fd = fd; ku->m_buf = KU_DEF_BUFLEN;
+	ku->buf = (uint8_t*)calloc(ku->m_buf, 1);
+	if (ku->buf == 0 || prepare(ku, 0) < 0 || fill_buffer(ku) <= 0) {
+		kurl_close(ku); // as before, this also closes fd
+		return 0;
+	}
+	return ku;
+}
+
+int kurl_buflen(kurl_t *ku, int len) // request a buffer of at least len bytes (rounded up to a power of two); returns the capacity now in effect
+{
+	uint8_t *grown; int cap = len;
+	if (len <= 0 || len > 0x40000000 || len < ku->l_buf) return ku->m_buf; // refuse sizes that would drop buffered data or overflow the rounding
+	if (!kurl_isfile(ku) && len < CURL_MAX_WRITE_SIZE * 2) return ku->m_buf; // remote streams keep at least two maximal cURL chunks
+	kroundup32(cap);
+	if ((grown = (uint8_t*)realloc(ku->buf, cap)) == 0) return ku->m_buf; // out of memory: keep the old buffer and its size
+	ku->buf = grown; ku->m_buf = cap; return cap;
+}
+
+ssize_t kurl_read(kurl_t *ku, void *buf, size_t nbytes)
+{
+	// Consume up to nbytes from the stream, refilling the internal buffer
+	// as needed. With buf == NULL the bytes are skipped instead of copied.
+	// Returns the number of bytes consumed; it is smaller than nbytes only
+	// when a refill yields nothing.
+	uint8_t *const out = (uint8_t*)buf;
+	ssize_t todo = nbytes; // bytes still to deliver
+	if (ku->l_buf == 0) return 0; // end-of-file
+	while (todo) {
+		const ssize_t avail = ku->l_buf - ku->p_buf; // unread bytes held in the buffer
+		const ssize_t take = avail >= todo? todo : avail;
+		if (out && take > 0)
+			memcpy(out + (nbytes - todo), ku->buf + ku->p_buf, take);
+		ku->p_buf += take;
+		todo -= take;
+		// buffer drained but more is wanted: refill, giving up if nothing arrives
+		if (todo && fill_buffer(ku) <= 0) break;
+	}
+	return nbytes - todo;
+}
+
+off_t kurl_seek(kurl_t *ku, off_t offset, int whence) // returns the new absolute offset, or -1 with ku->err set. FIXME: sometimes when seek() fails, read() will fail as well.
+{
+	off_t new_off = -1, cur_off;
+	int failed = 0, seek_end = 0;
+	if (ku == 0) return -1;
+	cur_off = ku->off0 + ku->p_buf;
+	if (whence == SEEK_SET) new_off = offset;
+	else if (whence == SEEK_CUR) new_off = cur_off + offset; // plain assignment: "+=" would fold in the -1 initializer and land one byte short
+	else if (whence == SEEK_END && kurl_isfile(ku)) new_off = lseek(ku->fd, offset, SEEK_END), seek_end = 1;
+	else { // not supported whence
+		ku->err = KURL_INV_WHENCE;
+		return -1;
+	}
+	if (new_off < 0) { // negative absolute offset
+		ku->err = KURL_SEEK_OUT;
+		return -1;
+	}
+	if (!seek_end && new_off >= cur_off && new_off - cur_off + ku->p_buf < ku->l_buf) { // target is already buffered: just move the cursor
+		ku->p_buf += new_off - cur_off;
+		return ku->off0 + ku->p_buf;
+	}
+	if (seek_end || new_off < cur_off || new_off - cur_off > KU_MAX_SKIP) { // if jump is large, do actual seek
+		ku->off0 = new_off;
+		ku->done_reading = 0;
+		if (prepare(ku, 1) < 0 || fill_buffer(ku) <= 0) failed = 1;
+	} else { // if jump is small, read through
+		off_t r;
+		r = kurl_read(ku, 0, new_off - cur_off);
+		if (r + cur_off != new_off) failed = 1; // out of range
+	}
+	if (failed) ku->err = KURL_SEEK_OUT, ku->l_buf = ku->p_buf = 0, new_off = -1; // the emptied buffer makes the stream report end-of-file
+	return new_off;
+}
+
+off_t kurl_tell(const kurl_t *ku)
+{
+	// absolute position = offset of the buffer start + read cursor inside the buffer; -1 for a NULL handle
+	return ku == 0? -1 : ku->off0 + ku->p_buf;
+}
+
+int kurl_eof(const kurl_t *ku)
+{
+	// unless file end, buffer should never be empty; a NULL handle counts as end-of-file
+	return ku == 0? 1 : (ku->l_buf == 0);
+}
+
+int kurl_fileno(const kurl_t *ku)
+{
+	// the stored descriptor (negative for a remote stream), or -1 for a NULL handle
+	return ku == 0? -1 : ku->fd;
+}
+
+int kurl_error(const kurl_t *ku)
+{
+	// the last recorded error code, or KURL_NULL for a NULL handle
+	return ku == 0? KURL_NULL : ku->err;
+}
+
+/*****************
+ *** HMAC-SHA1 ***
+ *****************/
+
+/* This code is public-domain - it is based on libcrypt placed in the public domain by Wei Dai and other contributors. */
+
+#define HASH_LENGTH 20
+#define BLOCK_LENGTH 64
+
+typedef struct sha1nfo { // SHA-1 hashing context
+	union { uint8_t b[BLOCK_LENGTH]; uint32_t w[BLOCK_LENGTH/4]; } buf; // block being assembled, viewable as bytes or as 32-bit words
+	uint8_t bufOffset; // number of bytes already placed in buf
+	union { uint8_t b[HASH_LENGTH]; uint32_t w[HASH_LENGTH/4]; } state; // running hash state; sha1_final() returns a pointer to state.b
+	uint32_t byteCount; // bytes fed through sha1_write1()
+	uint8_t keyBuffer[BLOCK_LENGTH]; // presumably the HMAC key block - not used in the code shown here, TODO confirm
+	uint8_t innerHash[HASH_LENGTH]; // presumably the inner HMAC digest - not used in the code shown here, TODO confirm
+} sha1nfo;
+
+void sha1_init(sha1nfo *s) // reset s so that a new message can be hashed
+{
+	const uint8_t table[] = { 0x01,0x23,0x45,0x67, 0x89,0xab,0xcd,0xef, 0xfe,0xdc,0xba,0x98, 0x76,0x54,0x32,0x10, 0xf0,0xe1,0xd2,0xc3 };
+	memcpy(s->state.b, table, HASH_LENGTH); // load the 20 initial state bytes; NOTE(review): the byte layout looks little-endian specific -- confirm
+	s->byteCount = 0; // no message bytes hashed yet
+	s->bufOffset = 0; // the block buffer is empty
+}
+
+#define rol32(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
+
+static void sha1_hashBlock(sha1nfo *s) // fold the 64-byte block held in s->buf into s->state (80 rounds)
+{
+	uint32_t i, t, a = s->state.w[0], b = s->state.w[1], c = s->state.w[2], d = s->state.w[3], e = s->state.w[4];
+	for (i = 0; i < 80; i++) {
+		if (i >= 16) { // extend the message schedule in place: buf.w serves as a 16-word circular window
+			t = s->buf.w[(i+13)&15] ^ s->buf.w[(i+8)&15] ^ s->buf.w[(i+2)&15] ^ s->buf.w[i&15];
+			s->buf.w[i&15] = rol32(t, 1);
+		}
+		if (i < 20)      t = 0x5a827999 + (d ^ (b & (c ^ d))); // every group of 20 rounds has its own constant and mixing function
+		else if (i < 40) t = 0x6ed9eba1 + (b ^ c ^ d);
+		else if (i < 60) t = 0x8f1bbcdc + ((b & c) | (d & (b | c)));
+		else             t = 0xca62c1d6 + (b ^ c ^ d);
+		t += rol32(a, 5) + e + s->buf.w[i&15];
+		e = d; d = c; c = rol32(b, 30); b = a; a = t; // rotate the five working variables
+	}
+	s->state.w[0] += a; s->state.w[1] += b; s->state.w[2] += c; s->state.w[3] += d; s->state.w[4] += e; // add this block's result into the running state
+}
+
+static inline void sha1_add(sha1nfo *s, uint8_t data) // append one raw byte to the block buffer (byteCount is not touched)
+{
+	s->buf.b[s->bufOffset ^ 3] = data; // ^3 mirrors the byte position inside each 4-byte word
+	if (++s->bufOffset == BLOCK_LENGTH) { // buffer full: hash it and start a fresh block
+		sha1_hashBlock(s);
+		s->bufOffset = 0;
+	}
+}
+
+void sha1_write1(sha1nfo *s, uint8_t data)
+{
+	s->byteCount += 1; /* only message bytes are counted; sha1_final() adds its padding through sha1_add() */
+	sha1_add(s, data);
+}
+
+void sha1_write(sha1nfo *s, const char *data, size_t len)
+{
+	for (size_t k = 0; k < len; ++k) sha1_write1(s, (uint8_t)data[k]); /* feed the input one byte at a time */
+}
+
+const uint8_t *sha1_final(sha1nfo *s) // pad the message, hash the final block(s) and return the HASH_LENGTH-byte state buffer
+{
+	int i;
+	sha1_add(s, 0x80); // padding starts with a single 0x80 byte
+	while (s->bufOffset != 56) sha1_add(s, 0); // zero-fill until only the last 8 bytes of a block remain
+	sha1_add(s, 0); // 8-byte length field holding byteCount * 8; its top three bytes are always zero here
+	sha1_add(s, 0);
+	sha1_add(s, 0);
+	sha1_add(s, s->byteCount >> 29);
+	sha1_add(s, s->byteCount >> 21);
+	sha1_add(s, s->byteCount >> 13);
+	sha1_add(s, s->byteCount >> 5);
+	sha1_add(s, s->byteCount << 3);
+	for (i = 0; i < 5; ++i) { // byte-swap every state word in place
+		uint32_t a = s->state.w[i];
+		s->state.w[i] = a<<24 | (a<<8&0x00ff0000) | (a>>8&0x0000ff00) | a>>24;
+	}
+	return s->state.b; // points into *s: valid only until s is reinitialized or written to again
+}
+
+#define HMAC_IPAD 0x36
+#define HMAC_OPAD 0x5c
+
+void sha1_init_hmac(sha1nfo *s, const uint8_t* key, int l_key) // begin an HMAC-SHA1 computation keyed with key[0..l_key)
+{
+	uint8_t i;
+	memset(s->keyBuffer, 0, BLOCK_LENGTH); // the stored key is zero-padded to a full block
+	if (l_key > BLOCK_LENGTH) { // a key longer than one block is replaced by its SHA1 digest
+		sha1_init(s);
+		while (l_key--) sha1_write1(s, *key++);
+		memcpy(s->keyBuffer, sha1_final(s), HASH_LENGTH);
+	} else memcpy(s->keyBuffer, key, l_key); // NOTE(review): a negative l_key is not rejected
+	sha1_init(s);
+	for (i = 0; i < BLOCK_LENGTH; ++i) // the inner pass starts with (key XOR ipad)
+		sha1_write1(s, s->keyBuffer[i] ^ HMAC_IPAD);
+}
+
+const uint8_t *sha1_final_hmac(sha1nfo *s) // finish the inner hash, run the outer pass and return the resulting digest
+{
+	uint8_t i;
+	memcpy(s->innerHash, sha1_final(s), HASH_LENGTH); // save the inner digest before the state is reset
+	sha1_init(s);
+	for (i = 0; i < BLOCK_LENGTH; ++i) sha1_write1(s, s->keyBuffer[i] ^ HMAC_OPAD); // outer pass: (key XOR opad) ...
+	for (i = 0; i < HASH_LENGTH; ++i) sha1_write1(s, s->innerHash[i]); // ... followed by the inner digest
+	return sha1_final(s);
+}
+
+/*******************
+ *** S3 protocol ***
+ *******************/
+
+#include <time.h>
+#include <ctype.h>
+
+static void s3_sign(const char *key, const char *data, char out[29]) // out = base64 of HMAC-SHA1(key, data), NUL-terminated
+{
+	const char *b64tab = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+	const uint8_t *digest;
+	int i, j, rest; // i: current digest byte, j: output position, rest: bits of digest[i] not emitted yet
+	sha1nfo s;
+	sha1_init_hmac(&s, (uint8_t*)key, strlen(key));
+	sha1_write(&s, data, strlen(data));
+	digest = sha1_final_hmac(&s);
+	for (j = i = 0, rest = 8; i < 20; ++j) { // base64 encoding
+		if (rest <= 6) { // this 6-bit group uses up digest[i] and takes its low bits from the next byte
+			int next = i < 19? digest[i+1] : 0; // past the last byte the group is padded with zero bits
+			out[j] = b64tab[(int)(digest[i] << (6-rest) & 0x3f) | next >> (rest+2)], ++i, rest += 2;
+		} else out[j] = b64tab[(int)digest[i] >> (rest-6) & 0x3f], rest -= 6; // group lies entirely inside digest[i]
+	}
+	out[j++] = '='; out[j] = 0; // SHA1 digest always has 160 bits, or 20 bytes. We need one '=' at the end.
+}
+
+/* Read the AWS key id (line 1) and secret (line 2) from `fn`, or from $HOME/.awssecret when `fn` is NULL.
+ * Returns a malloc'd buffer laid out as "id\0secret\0" that the caller frees, or NULL on any failure. */
+static char *s3_read_awssecret(const char *fn)
+{
+	char *p, buf[128], *path;
+	FILE *fp;
+	int l;
+	if (fn == 0) {
+		char *home = getenv("HOME");
+		if (home == 0) return 0;
+		l = strlen(home) + 12; // room for "/.awssecret" plus the terminating NUL
+		path = (char*)malloc(l);
+		if (path == 0) return 0; // out of memory
+		strcat(strcpy(path, home), "/.awssecret");
+	} else path = (char*)fn;
+	fp = fopen(path, "r");
+	if (path != fn) free(path); // only the path built above is owned by this function
+	if (fp == 0) return 0;
+	l = fread(buf, 1, 127, fp); // at most 127 bytes, so buf[l] below stays in bounds
+	fclose(fp);
+	buf[l] = 0;
+	for (p = buf; *p != 0 && *p != '\n'; ++p); // find the end of the key id line
+	if (*p == 0) return 0; // no second line, hence no secret
+	for (*p++ = 0; *p != 0 && *p != '\n'; ++p); // terminate the id, then scan to the end of the secret
+	*p = 0, l = p - buf + 1; // terminate the secret; l covers both strings and their NULs
+	p = (char*)malloc(l);
+	if (p != 0) memcpy(p, buf, l);
+	return p;
+}
+
+typedef struct { int l, m; char *s; } kstring_t; // l: bytes in use, m: allocated capacity, s: NUL-terminated buffer
+// Append l bytes of p to s, keeping s->s NUL-terminated. Returns l, or -1 (s left untouched) if realloc() fails.
+static inline int kputsn(const char *p, int l, kstring_t *s)
+{
+	if (s->l + l + 1 >= s->m) { // not enough room for the new bytes plus the terminating NUL
+		char *t; int m = s->l + l + 2;
+		kroundup32(m);
+		if ((t = (char*)realloc(s->s, m)) == 0) return -1; // keep the old buffer instead of leaking it and writing through NULL
+		s->m = m, s->s = t;
+	}
+	memcpy(s->s + s->l, p, l);
+	s->l += l; s->s[s->l] = 0;
+	return l;
+}
+
+/* Translate an "s3://bucket/object" URL into what curl needs for a signed GET: a.url (HTTPS URL), a.date
+ * ("Date: ..." header) and a.auth ("Authorization: AWS id:signature" header), all heap-allocated. Credentials
+ * are _id/_secret, or are read via s3_read_awssecret(fn_secret) if either is NULL. Returns zeros on failure. */
+s3aux_t s3_parse(const char *url, const char *_id, const char *_secret, const char *fn_secret)
+{
+	const char *id, *secret, *bucket, *obj;
+	char *id_secret = 0, date[64], sig[29];
+	time_t t; struct tm tmt;
+	s3aux_t a = {0,0};
+	kstring_t str = {0,0,0};
+	if (strstr(url, "s3://") != url) return a; // parse URL: it must begin with the s3:// scheme
+	bucket = url + 5;
+	for (obj = bucket; *obj && *obj != '/'; ++obj);
+	if (*obj == 0) return a; // no object
+	if (_id == 0 || _secret == 0) { // acquire the AWS credential from the secret file
+		id_secret = s3_read_awssecret(fn_secret);
+		if (id_secret == 0) return a; // fail to read the AWS credential
+		id = id_secret;
+		secret = id_secret + strlen(id) + 1; // the secret follows the id's NUL terminator
+	} else id = _id, secret = _secret;
+	// compose URL for curl
+	kputsn("https://", 8, &str);
+	kputsn(bucket, obj - bucket, &str);
+	kputsn(".s3.amazonaws.com", 17, &str);
+	kputsn(obj, strlen(obj), &str);
+	a.url = str.s;
+	// compose the Date line
+	str.l = str.m = 0; str.s = 0;
+	t = time(0);
+	strftime(date, 64, "%a, %d %b %Y %H:%M:%S +0000", gmtime_r(&t, &tmt));
+	kputsn("Date: ", 6, &str);
+	kputsn(date, strlen(date), &str);
+	a.date = str.s;
+	// compose the string to sign and sign it
+	str.l = str.m = 0; str.s = 0;
+	kputsn("GET\n\n\n", 6, &str);
+	kputsn(date, strlen(date), &str);
+	kputsn("\n", 1, &str);
+	kputsn(bucket-1, strlen(bucket-1), &str); // bucket-1 is the '/' before the bucket, i.e. "/bucket/object"
+	s3_sign(secret, str.s, sig);
+	str.l = 0; // compose the Authorization line, reusing the string-to-sign buffer
+	kputsn("Authorization: AWS ", 19, &str);
+	kputsn(id, strlen(id), &str);
+	kputsn(":", 1, &str);
+	kputsn(sig, strlen(sig), &str);
+	a.auth = str.s;
+	free(id_secret); // id/secret are not needed any more; this buffer used to leak (free(NULL) is a no-op)
+//	printf("curl -H '%s' -H '%s' %s\n", a.date, a.auth, a.url);
+	return a;
+}
+
+/*********************
+ *** Main function ***
+ *********************/
+
+#ifdef KURL_MAIN
+/* Demo driver: kurl [-c start] [-l length] [-a s3_key_file] <url> copies the selected byte range to stdout. */
+int main(int argc, char *argv[])
+{
+	kurl_t *f;
+	int c, l, l_buf = 0x10000;
+	off_t start = 0, rest = -1; // rest counts the bytes still wanted; it never hits 0 when -l is absent
+	uint8_t *buf;
+	char *p;
+	kurl_opt_t opt;
+
+	memset(&opt, 0, sizeof(kurl_opt_t));
+	while ((c = getopt(argc, argv, "c:l:a:")) >= 0) {
+		if (c == 'c') start = strtoll(optarg, &p, 0); // strtoll: off_t may be wider than long
+		else if (c == 'l') rest = strtoll(optarg, &p, 0);
+		else if (c == 'a') opt.s3key_fn = optarg;
+	}
+	if (optind == argc) {
+		fprintf(stderr, "Usage: kurl [-c start] [-l length] [-a s3_key_file] <url>\n");
+		return 1;
+	}
+	kurl_init();
+	f = kurl_open(argv[optind], &opt);
+	if (f == 0) {
+		fprintf(stderr, "ERROR: fail to open URL\n");
+		return 2;
+	}
+	if (start > 0 && kurl_seek(f, start, SEEK_SET) < 0) {
+		kurl_close(f);
+		fprintf(stderr, "ERROR: fail to seek\n");
+		return 3;
+	}
+	buf = (uint8_t*)calloc(l_buf, 1);
+	if (buf == 0) { kurl_close(f); fprintf(stderr, "ERROR: out of memory\n"); return 4; }
+	while (rest != 0) {
+		int to_read = rest > 0 && rest < l_buf? rest : l_buf;
+		l = kurl_read(f, buf, to_read);
+		if (l <= 0) break; // end of stream; a negative return must never reach fwrite() as a size
+		fwrite(buf, 1, l, stdout);
+		rest -= l;
+	}
+	free(buf);
+	kurl_close(f);
+	kurl_destroy();
+	return 0;
+}
+#endif
diff --git a/web/server/h2o/libh2o/deps/klib/kurl.h b/web/server/h2o/libh2o/deps/klib/kurl.h
new file mode 100644
index 000000000..f07f64118
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/kurl.h
@@ -0,0 +1,57 @@
+#ifndef KURL_H
+#define KURL_H
+
+#include <sys/types.h>
+
+#define KURL_NULL       1 /* reported by kurl_error() when the handle itself is NULL */
+#define KURL_INV_WHENCE 2 /* kurl_seek(): unsupported whence value */
+#define KURL_SEEK_OUT   3 /* kurl_seek(): negative target offset, or the seek could not be completed */
+#define KURL_NO_AUTH    4 /* presumably: no usable S3 credentials -- TODO confirm where it is set */
+
+struct kurl_t;
+typedef struct kurl_t kurl_t;
+
+typedef struct {
+	const char *s3keyid; /* presumably the AWS access key id used for s3:// URLs -- confirm against kurl_open() */
+	const char *s3secretkey; /* presumably the matching AWS secret key -- confirm against kurl_open() */
+	const char *s3key_fn; /* presumably the path of a file holding the credentials -- confirm against kurl_open() */
+} kurl_opt_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int kurl_init(void);
+void kurl_destroy(void);
+
+kurl_t *kurl_open(const char *url, kurl_opt_t *opt);
+kurl_t *kurl_dopen(int fd);
+int kurl_close(kurl_t *ku);
+ssize_t kurl_read(kurl_t *ku, void *buf, size_t nbytes);
+off_t kurl_seek(kurl_t *ku, off_t offset, int whence);
+int kurl_buflen(kurl_t *ku, int len);
+
+off_t kurl_tell(const kurl_t *ku);
+int kurl_eof(const kurl_t *ku);
+int kurl_fileno(const kurl_t *ku);
+int kurl_error(const kurl_t *ku);
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifndef KNETFILE_H
+#define KNETFILE_H
+typedef kurl_t knetFile;
+#define knet_open(fn, mode) kurl_open(fn, 0)
+#define knet_dopen(fd, mode) kurl_dopen(fd)
+#define knet_close(fp) kurl_close(fp)
+#define knet_read(fp, buf, len) kurl_read(fp, buf, len)
+#define knet_seek(fp, off, whence) kurl_seek(fp, off, whence)
+#define knet_tell(fp) kurl_tell(fp)
+#define knet_fileno(fp) kurl_fileno(fp)
+#define knet_win32_init() kurl_init()
+#define knet_win32_destroy() kurl_destroy()
+#endif
+
+#endif
diff --git a/web/server/h2o/libh2o/deps/klib/kvec.h b/web/server/h2o/libh2o/deps/klib/kvec.h
new file mode 100644
index 000000000..676be8b80
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/kvec.h
@@ -0,0 +1,90 @@
+/* The MIT License
+
+   Copyright (c) 2008, by Attractive Chaos <attractor@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/*
+  An example:
+
+#include "kvec.h"
+int main() {
+	kvec_t(int) array;
+	kv_init(array);
+	kv_push(int, array, 10); // append
+	kv_a(int, array, 20) = 5; // dynamic
+	kv_A(array, 20) = 4; // static
+	kv_destroy(array);
+	return 0;
+}
+*/
+
+/*
+  2008-09-22 (0.1.0):
+
+	* The initial version.
+
+*/
+
+#ifndef AC_KVEC_H
+#define AC_KVEC_H
+
+#include <stdlib.h>
+#include <string.h>
+#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+/* kvec_t(type): growable array; n = elements in use, m = allocated capacity, a = storage. realloc() results are never checked. */
+#define kvec_t(type) struct { size_t n, m; type *a; }
+#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0)
+#define kv_destroy(v) free((v).a)
+#define kv_A(v, i) ((v).a[(i)])
+#define kv_pop(v) ((v).a[--(v).n])
+#define kv_size(v) ((v).n)
+#define kv_max(v) ((v).m)
+/* Set the capacity of v to s elements (n is left unchanged). */
+#define kv_resize(type, v, s)  ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m))
+/* Copy v0 into v1, growing v1 when needed; expands to memcpy() from <string.h>. */
+#define kv_copy(type, v1, v0) do {							\
+		if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n);	\
+		(v1).n = (v0).n;									\
+		memcpy((v1).a, (v0).a, sizeof(type) * (v0).n);		\
+	} while (0)
+/* Append x, doubling the capacity (starting at 2) whenever the array is full. */
+#define kv_push(type, v, x) do {									\
+		if ((v).n == (v).m) {										\
+			(v).m = (v).m? (v).m<<1 : 2;							\
+			(v).a = (type*)realloc((v).a, sizeof(type) * (v).m);	\
+		}															\
+		(v).a[(v).n++] = (x);										\
+	} while (0)
+/* Append an uninitialized slot and yield a pointer to it; wrapped in parentheses so `p = kv_pushp(...)` gets the pointer. */
+#define kv_pushp(type, v) ((((v).n == (v).m)?							\
+						   ((v).m = ((v).m? (v).m<<1 : 2),				\
+							(v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0)	\
+						   : 0), ((v).a + ((v).n++)))
+/* Lvalue for element i; grows the capacity and/or n so that index i is valid (new slots are not initialized). */
+#define kv_a(type, v, i) (((v).m <= (size_t)(i)? \
+						  ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \
+						   (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
+						  : (v).n <= (size_t)(i)? (v).n = (i) + 1 \
+						  : 0), (v).a[(i)])
+
+#endif
diff --git a/web/server/h2o/libh2o/deps/klib/lua/bio.lua b/web/server/h2o/libh2o/deps/klib/lua/bio.lua
new file mode 100644
index 000000000..c9f220059
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/lua/bio.lua
@@ -0,0 +1,149 @@
+-- bioinformatics routines
+
+-- Description: read a fasta/fastq file
+local function readseq(fp)
+	local finished, last = false, nil; -- last: a header (or "+") line that was already read ahead
+	return function()
+		-- each call returns name, seq[, qual] for the next record, or nil once the input is exhausted
+		if finished then return nil end
+		if (last == nil) then -- the first record or a record following a fastq
+			for l in fp:lines() do
+				if l:byte(1) == 62 or l:byte(1) == 64 then -- ">" || "@"
+					last = l;
+					break;
+				end
+			end
+			if last == nil then
+				finished = true;
+				return nil;
+			end
+		end
+		local tmp = last:find("%s");
+		local name = (tmp and last:sub(2, tmp-1)) or last:sub(2); -- sequence name; local, so it no longer leaks into the global table
+		local seqs = {};
+		local c; -- the first character of the last line
+		last = nil;
+		for l in fp:lines() do -- read sequence
+			c = l:byte(1);
+			if c == 62 or c == 64 or c == 43 then -- ">", "@" or "+" ends the sequence lines
+				last = l;
+				break;
+			end
+			table.insert(seqs, l);
+		end
+		if last == nil then finished = true end -- end of file
+		if c ~= 43 then return name, table.concat(seqs) end -- a fasta record
+		local seq, len = table.concat(seqs), 0; -- prepare to parse quality
+		seqs = {};
+		for l in fp:lines() do -- read quality
+			table.insert(seqs, l);
+			len = len + #l;
+			if len >= #seq then -- quality is complete once it is as long as the sequence
+				last = nil;
+				return name, seq, table.concat(seqs);
+			end
+		end
+		finished = true;
+		return name, seq; -- truncated quality: return the record without it
+	end
+end
+
+-- extract a subsequence from a FASTA file indexed by samtools faidx
+local function faidxsub(fn)
+	local fpidx = io.open(fn .. ".fai");
+	if fpidx == nil then
+		io.stderr:write("[faidxsub] fail to open the FASTA index file.\n");
+		return nil
+	end
+	local idx = {}; -- name -> {len, offset, line_blen, line_len}, all stored as numbers
+	for l in fpidx:lines() do
+		local name, len, offset, line_blen, line_len = l:match("(%S+)%s(%d+)%s(%d+)%s(%d+)%s(%d+)");
+		if name then idx[name] = {tonumber(len), tonumber(offset), tonumber(line_blen), tonumber(line_len)} end
+	end
+	fpidx:close();
+	local fp = io.open(fn);
+	if fp == nil then -- without this check the closure below would fail on its first fp:seek()
+		io.stderr:write("[faidxsub] fail to open the FASTA file.\n");
+		return nil
+	end
+	return function(name, beg_, end_) -- 0-based coordinate; call with name == nil to close the file
+		if name == nil then fp:close(); return nil; end
+		local a = idx[name];
+		if a == nil then return nil end -- sequence name not present in the index
+		beg_, end_ = beg_ or 0, end_ or a[1];
+		end_ = (end_ <= a[1] and end_) or a[1]; -- clip the end to the sequence length
+		local fb, fe = math.floor(beg_ / a[3]), math.floor(end_ / a[3]); -- line numbers of both ends
+		local qb, qe = beg_ - fb * a[3], end_ - fe * a[3]; -- offsets within those lines
+		fp:seek("set", a[2] + fb * a[4] + qb);
+		local s = fp:read((fe - fb) * a[4] + (qe - qb)); -- nil when nothing can be read at that position
+		return s and (s:gsub("%s", "")); -- strip line breaks; the parentheses drop gsub's match count
+	end
+end
+
+--Description: Index a list of intervals and test if a given interval overlaps with the list
+--Example: lua -lbio -e 'a={{100,201},{200,300},{400,600}};f=bio.intvovlp(a);print(f(600,700))'
+--[[
+  By default, we keep for each tiling 8192 window the interval overlapping the
+  window while having the smallest start position. This method may not work
+  well when most intervals are small but few intervals span a long distance.
+]]--
+local function intvovlp(intv, bits)
+	bits = bits or 13 -- the default bin size is 8192 = 1<<13
+	table.sort(intv, function(a,b) return a[1] < b[1] end) -- sort by the start (intv is modified in place)
+	-- merge intervals; the step speeds up testing, but can be skipped
+	local b, e, k = -1, -1, 1
+	for i = 1, #intv do
+		if e < intv[i][1] then -- gap before interval i: flush the current run and start a new one
+			if e >= 0 then intv[k], k = {b, e}, k + 1 end
+			b, e = intv[i][1], intv[i][2]
+		elseif e < intv[i][2] then e = intv[i][2] end -- only ever extend: a nested interval must not shrink the run
+	end
+	if e >= 0 then intv[k] = {b, e} end
+	while #intv > k do table.remove(intv) end -- truncate the interval list
+	-- build the index for the list of intervals
+	local idx, size, max = {}, 2 ^ bits, 0 -- idx[bin] = first merged interval touching that bin
+	for i = 1, #intv do
+		b = math.modf(intv[i][1] / size)
+		e = math.modf(intv[i][2] / size)
+		if b == e then idx[b] = idx[b] or i
+		else for j = b, e do idx[j] = idx[j] or i end end
+		max = (max > e and max) or e
+	end
+	-- return a function (closure)
+	return function(_beg, _end) -- true if [_beg, _end) overlaps any indexed interval
+		local x = math.modf(_beg / size)
+		if x > max then return false end
+		local off = idx[x]; -- the start bin
+		if off == nil then -- the following is not the best in efficiency
+			for i = x - 1, 0, -1 do -- find the minimum bin with a value
+				if idx[i] ~= nil then off = idx[i]; break; end
+			end
+			if off == nil then return false end
+		end
+		for i = off, #intv do -- start from off and search for overlaps
+			if intv[i][1] >= _end then return false
+			elseif intv[i][2] > _beg then return true end
+		end
+		return false
+	end
+end
+
+bio = {
+	readseq = readseq,
+	faidxsub = faidxsub,
+	intvovlp = intvovlp
+}
+
+bio.nt16 = {
+[0]=15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+	15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, 15,15, 5, 6,  8,15, 7, 9,  0,10,15,15, 15,15,15,15,
+	15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, 15,15, 5, 6,  8,15, 7, 9,  0,10,15,15, 15,15,15,15,
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
+}
+bio.ntcnt  = { [0]=4, 1, 1,  2, 1,  2, 2,  3, 1, 2, 2,  3, 2,  3, 3,  4 }
+bio.ntcomp = { [0]=0, 8, 4, 12, 2, 10, 9, 14, 1, 6, 5, 13, 3, 11, 7, 15 }
+bio.ntrev  = 'XACMGRSVTWYHKDBN'
diff --git a/web/server/h2o/libh2o/deps/klib/lua/klib.lua b/web/server/h2o/libh2o/deps/klib/lua/klib.lua
new file mode 100644
index 000000000..bfe52f7f7
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/lua/klib.lua
@@ -0,0 +1,677 @@
+--[[
+  The MIT License
+  
+  Copyright (c) 2011, Attractive Chaos <attractor@live.co.uk>
+  
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+  
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+  
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+  SOFTWARE.
+]]--
+
+--[[
+  This is a Lua library, more exactly a collection of Lua snippets, covering
+  utilities (e.g. getopt), string operations (e.g. split), statistics (e.g.
+  Fisher's exact test), special functions (e.g. logarithm gamma) and matrix
+  operations (e.g. Gauss-Jordan elimination). The routines are designed to be
+  as independent as possible, such that one can copy-paste relevant pieces of
+  code without worrying about additional library dependencies.
+
+  If you use routines from this library, please include the licensing
+  information above where appropriate.
+]]--
+
+--[[
+  Library functions and dependencies. "a>b" means "a is required by b"; "b<a"
+  means "b depends on a".
+
+  os.getopt()
+  string:split()
+  io.xopen()
+  table.ksmall()
+  table.shuffle()
+  math.lgamma() >math.lbinom() >math.igamma()
+  math.igamma() <math.lgamma() >matrix.chi2()
+  math.erfc()
+  math.lbinom() <math.lgamma() >math.fisher_exact()
+  math.bernstein_poly() <math.lbinom()
+  math.fisher_exact() <math.lbinom()
+  math.jackknife()
+  math.pearson()
+  math.spearman()
+  math.fmin()
+  matrix
+  matrix.add()
+  matrix.T() >matrix.mul()
+  matrix.mul() <matrix.T()
+  matrix.tostring()
+  matrix.chi2() <math.igamma()
+  matrix.solve()
+]]--
+
+-- Description: getopt() translated from the BSD getopt(); compatible with the default Unix getopt()
+--[[ Example:
+	for o, a in os.getopt(arg, 'a:b') do
+		print(o, a)
+	end
+]]--
+function os.getopt(args, ostr)
+	local arg, place = nil, 0; -- place: position of the next option character inside args[1]; 0 = start a new word
+	return function () -- each call returns option[, argument]; consumed words are removed from args
+		if place == 0 then -- update scanning pointer
+			place = 1
+			if #args == 0 or args[1]:sub(1, 1) ~= '-' then place = 0; return nil end
+			if #args[1] >= 2 then
+				place = place + 1
+				if args[1]:sub(2, 2) == '-' then -- found "--"
+					place = 0
+					table.remove(args, 1);
+					return nil;
+				end
+			end
+		end
+		local optopt = args[1]:sub(place, place);
+		place = place + 1;
+		local oli = ostr:find(optopt, 1, true); -- plain search: characters such as '.' or '%' must not act as patterns
+		if optopt == ':' or oli == nil then -- unknown option
+			if optopt == '-' then return nil end
+			if place > #args[1] then
+				table.remove(args, 1);
+				place = 0;
+			end
+			return '?';
+		end
+		oli = oli + 1;
+		if ostr:sub(oli, oli) ~= ':' then -- do not need argument
+			arg = nil;
+			if place > #args[1] then
+				table.remove(args, 1);
+				place = 0;
+			end
+		else -- need an argument
+			if place <= #args[1] then  -- no white space
+				arg = args[1]:sub(place);
+			else
+				table.remove(args, 1);
+				if #args == 0 then -- an option requiring argument is the last one
+					place = 0;
+					if ostr:sub(1, 1) == ':' then return ':' end
+					return '?';
+				else arg = args[1] end
+			end
+			table.remove(args, 1);
+			place = 0;
+		end
+		return optopt, arg;
+	end
+end
+
+-- Description: string split
+function string:split(sep, n)
+	local pattern = sep or "%s+"; -- separator is a Lua pattern; defaults to runs of whitespace
+	local fields, pos = {}, 1;
+	while true do
+		local first, last = self:find(pattern, pos);
+		if first == nil then -- no further separator: the remainder is the final field
+			fields[#fields + 1] = self:sub(pos);
+			return fields;
+		end
+		fields[#fields + 1] = self:sub(pos, first - 1);
+		pos = last + 1;
+		if n and #fields == n then -- n splits done: keep the rest as one unsplit field
+			fields[#fields + 1] = self:sub(pos);
+			return fields;
+		end
+		if pos > #self then return fields end -- the text ended on a separator: no trailing empty field
+	end
+end
+
+-- Description: smart file open
+function io.xopen(fn, mode)
+	mode = mode or 'r'; -- any mode other than 'r' is treated as writing for '-' and compressed files
+	if fn == nil then return io.stdin end
+	if fn == '-' then return (mode == 'r' and io.stdin) or io.stdout end
+	local prog = (fn:sub(-3) == '.gz' and 'gzip') or (fn:sub(-4) == '.bz2' and 'bzip2') or nil -- was misspelled 'bgzip2' for writing
+	if prog == nil then return io.open(fn, mode) end -- plain file
+	if mode == 'r' then return io.popen(prog .. ' -dc ' .. fn, 'r') else return io.popen(prog .. ' > ' .. fn, 'w') end -- NOTE: fn reaches the shell unquoted
+end
+
+-- Description: find the k-th smallest element in an array (Ref. http://ndevilla.free.fr/median/)
+function table.ksmall(arr, k)
+	local low, high = 1, #arr; -- current search window (1-based, inclusive); arr is reordered in place
+	while true do
+		if high <= low then return arr[k] end -- window of at most one element
+		if high == low + 1 then -- two elements left: order them and finish
+			if arr[high] < arr[low] then arr[high], arr[low] = arr[low], arr[high] end;
+			return arr[k];
+		end
+		local mid = math.floor((high + low) / 2);
+		if arr[high] < arr[mid] then arr[mid], arr[high] = arr[high], arr[mid] end -- these three swaps leave arr[mid] <= arr[low] <= arr[high]
+		if arr[high] < arr[low] then arr[low], arr[high] = arr[high], arr[low] end
+		if arr[low]  < arr[mid] then arr[low], arr[mid]  = arr[mid],  arr[low] end
+		arr[mid], arr[low+1] = arr[low+1], arr[mid];
+		local ll, hh = low + 1, high;
+		while true do -- partition the window around the pivot arr[low]
+			repeat ll = ll + 1 until arr[ll] >= arr[low]
+			repeat hh = hh - 1 until arr[low] >= arr[hh]
+			if hh < ll then break end
+			arr[ll], arr[hh] = arr[hh], arr[ll];
+		end
+		arr[low], arr[hh] = arr[hh], arr[low]; -- put the pivot at position hh
+		if hh <= k then low = ll end -- keep only the side of the pivot that contains index k
+		if hh >= k then high = hh - 1 end
+	end
+end
+
+-- Description: shuffle/permute an array
+function table.shuffle(a)
+	local last = #a
+	while last >= 1 do -- walk backwards, swapping each slot with a randomly chosen slot at or before it
+		local pick = math.random(last); a[pick], a[last] = a[last], a[pick]; last = last - 1
+	end
+end
+
+--
+-- Mathematics
+--
+
+-- Description: log gamma function
+-- Required by: math.lbinom()
+-- Reference: AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245
+function math.lgamma(z)
+	local x; -- accumulates the rational series below, one term per line
+	x = 0.1659470187408462e-06     / (z+7);
+	x = x + 0.9934937113930748e-05 / (z+6);
+	x = x - 0.1385710331296526     / (z+5);
+	x = x + 12.50734324009056      / (z+4);
+	x = x - 176.6150291498386      / (z+3);
+	x = x + 771.3234287757674      / (z+2);
+	x = x - 1259.139216722289      / (z+1);
+	x = x + 676.5203681218835      / z; -- divides by z itself, so z == 0 is not handled
+	x = x + 0.9999999999995183;
+	return math.log(x) - 5.58106146679532777 - z + (z-0.5) * math.log(z+6.5); -- log of the series combined with the closed-form part
+end
+
+-- Description: regularized incomplete gamma function
+-- Dependent on: math.lgamma()
+--[[
+  Formulas are taken from Wiki, with additional input from Numerical
+  Recipes in C (for modified Lentz's algorithm) and AS245
+  (http://lib.stat.cmu.edu/apstat/245).
+ 
+  A good online calculator is available at:
+ 
+    http://www.danielsoper.com/statcalc/calc23.aspx
+ 
+  It calculates upper incomplete gamma function, which equals
+  math.igamma(s,z,true)*math.exp(math.lgamma(s))
+]]--
+function math.igamma(s, z, complement)
+	-- series expansion; the dispatch at the bottom uses it when z <= 1 or z < s
+	local function _kf_gammap(s, z)
+		local sum, x = 1, 1;
+		for k = 1, 100 do -- at most 100 terms
+			x = x * z / (s + k);
+			sum = sum + x;
+			if x / sum < 1e-14 then break end -- stop once the newest term is negligible relative to the sum
+		end
+		return math.exp(s * math.log(z) - z - math.lgamma(s + 1.) + math.log(sum));
+	end
+	-- continued fraction; the dispatch at the bottom uses it in the remaining case
+	local function _kf_gammaq(s, z)
+		local C, D, f, TINY;
+		f = 1. + z - s; C = f; D = 0.; TINY = 1e-290;
+		-- Modified Lentz's algorithm for computing continued fraction. See Numerical Recipes in C, 2nd edition, section 5.2
+		for j = 1, 100 do -- at most 100 iterations
+			local d;
+			local a, b = j * (s - j), j*2 + 1 + z - s;
+			D = b + a * D;
+			if D < TINY then D = TINY end -- clamp before D is inverted below
+			C = b + a / C;
+			if C < TINY then C = TINY end
+			D = 1. / D;
+			d = C * D;
+			f = f * d;
+			if math.abs(d - 1) < 1e-14 then break end -- converged: the latest factor is essentially 1
+		end
+		return math.exp(s * math.log(z) - z - math.lgamma(s) - math.log(f));
+	end
+	-- evaluate one helper and, where the other quantity is wanted, return 1 minus it
+	if complement then
+		return ((z <= 1 or z < s) and 1 - _kf_gammap(s, z)) or _kf_gammaq(s, z);
+	else -- complement is false or nil
+		return ((z <= 1 or z < s) and _kf_gammap(s, z)) or (1 - _kf_gammaq(s, z));
+	end
+end
+
+math.M_SQRT2   = 1.41421356237309504880  -- sqrt(2)
+math.M_SQRT1_2 = 0.70710678118654752440  -- 1/sqrt(2)
+
+-- Description: complement error function erfc(x): \Phi(x) = 0.5 * erfc(-x/M_SQRT2)
+function math.erfc(x)
+	local z = math.abs(x) * math.M_SQRT2 -- work on |x| scaled by sqrt(2); the sign of x is handled in the return statements
+	if z > 37 then return (x > 0 and 0) or 2 end -- far tails: return the limiting value directly
+	local expntl = math.exp(-0.5 * z * z)
+	local p
+	if z < 10. / math.M_SQRT2 then -- for small z
+	    p = expntl * ((((((.03526249659989109 * z + .7003830644436881) * z + 6.37396220353165) * z + 33.912866078383)
+				* z + 112.0792914978709) * z + 221.2135961699311) * z + 220.2068679123761)
+			/ (((((((.08838834764831844 * z + 1.755667163182642) * z + 16.06417757920695) * z + 86.78073220294608)
+				* z + 296.5642487796737) * z + 637.3336333788311) * z + 793.8265125199484) * z + 440.4137358247522);
+	else p = expntl / 2.506628274631001 / (z + 1. / (z + 2. / (z + 3. / (z + 4. / (z + .65))))) end -- larger z: nested-fraction expression
+	return (x > 0 and 2 * p) or 2 * (1 - p) -- for x <= 0 the result is 2 * (1 - p)
+end
+
+-- Description: log binomial coefficient
+-- Dependent on: math.lgamma()
+-- Required by: math.fisher_exact()
+function math.lbinom(n, m)
+	-- two arguments: the single value lgamma(n+1) - lgamma(m+1) - lgamma(n-m+1)
+	if m ~= nil then return math.lgamma(n+1) - math.lgamma(m+1) - math.lgamma(n-m+1) end
+	-- one argument: a table of that value for every m in 0..n, indexed from 0
+	local row, lgn = {}, math.lgamma(n+1);
+	row[0], row[n] = 0, 0;
+	for k = 1, n-1 do row[k] = lgn - math.lgamma(k+1) - math.lgamma(n-k+1) end
+	return row;
+end
+
+-- Description: Bernstein polynomials (mainly for Bezier curves)
+-- Dependent on: math.lbinom()
+-- Note: to compute derivative: let beta_new[i]=beta[i+1]-beta[i]
+function math.bernstein_poly(beta)
+	local degree = #beta - 1;
+	local logc = math.lbinom(degree); -- log binomial coefficients, indexed 0..degree
+	return function (t) -- evaluates the polynomial with coefficients beta at t in [0, 1]
+		assert(t >= 0 and t <= 1);
+		if t == 0 then return beta[1] end -- end points are returned directly, which also avoids log(0)
+		if t == 1 then return beta[degree+1] end
+		local acc, lt, l1t = 0, math.log(t), math.log(1-t);
+		for i = 0, degree do acc = acc + beta[i+1] * math.exp(logc[i] + i * lt + (degree-i) * l1t) end
+		return acc;
+	end
+end
+
+-- Description: Fisher's exact test
+-- Dependent on: math.lbinom()
+-- Return: left-, right- and two-tail P-values
+--[[
+  Fisher's exact test for 2x2 contingency tables:
+
+    n11  n12  | n1_
+    n21  n22  | n2_
+   -----------+----
+    n_1  n_2  | n
+
+  Reference: http://www.langsrud.com/fisher.htm
+]]--
+function math.fisher_exact(n11, n12, n21, n22)
+	local aux; -- keep the states of n* for acceleration
+	-- Returns left, right, two: the left-, right- and two-tail P-values of the 2x2 table.
+	-- Description: hypergeometric function
+	local function hypergeo(n11, n1_, n_1, n)
+		return math.exp(math.lbinom(n1_, n11) + math.lbinom(n-n1_, n_1-n11) - math.lbinom(n, n_1));
+	end
+
+	-- Description: incremental hypergeometric function
+	-- Note: aux = {n11, n1_, n_1, n, p}
+	local function hypergeo_inc(n11, n1_, n_1, n)
+		if n1_ ~= 0 or n_1 ~= 0 or n ~= 0 then
+			aux = {n11, n1_, n_1, n, 1};
+		else -- then only n11 is changed
+			-- mod == 0 (n11 a multiple of 11) skips the incremental update, so p is recomputed from scratch below
+			local _, mod = math.modf(n11 / 11); -- `_` is local now; it used to be written to the global table
+			if mod ~= 0 and n11 + aux[4] - aux[2] - aux[3] ~= 0 then
+				if n11 == aux[1] + 1 then -- increase by 1
+					aux[5] = aux[5] * (aux[2] - aux[1]) / n11 * (aux[3] - aux[1]) / (n11 + aux[4] - aux[2] - aux[3]);
+					aux[1] = n11;
+					return aux[5];
+				end
+				if n11 == aux[1] - 1 then -- decrease by 1
+					aux[5] = aux[5] * aux[1] / (aux[2] - n11) * (aux[1] + aux[4] - aux[2] - aux[3]) / (aux[3] - n11);
+					aux[1] = n11;
+					return aux[5];
+				end
+			end
+			aux[1] = n11;
+		end
+		aux[5] = hypergeo(aux[1], aux[2], aux[3], aux[4]);
+		return aux[5];
+	end
+	
+	-- Description: computing the P-value by Fisher's exact test
+	local max, min, left, right, n1_, n_1, n, two, p, q, i, j;
+	n1_, n_1, n = n11 + n12, n11 + n21, n11 + n12 + n21 + n22;
+	max = (n_1 < n1_ and n_1) or n1_; -- max n11, for the right tail
+	min = n1_ + n_1 - n;
+	if min < 0 then min = 0 end -- min n11, for the left tail
+	two, left, right = 1, 1, 1;
+	if min == max then return left, right, two end -- no need to do test; all three P-values are 1
+	q = hypergeo_inc(n11, n1_, n_1, n); -- the probability of the current table
+	-- left tail
+	i, left, p = min + 1, 0, hypergeo_inc(min, 0, 0, 0);
+	while p < 0.99999999 * q do
+		left, p, i = left + p, hypergeo_inc(i, 0, 0, 0), i + 1;
+	end
+	i = i - 1;
+	if p < 1.00000001 * q then left = left + p;
+	else i = i - 1 end
+	-- right tail
+	j, right, p = max - 1, 0, hypergeo_inc(max, 0, 0, 0);
+	while p < 0.99999999 * q do
+		right, p, j = right + p, hypergeo_inc(j, 0, 0, 0), j - 1;
+	end
+	j = j + 1;
+	if p < 1.00000001 * q then right = right + p;
+	else j = j + 1 end
+	-- two-tail
+	two = left + right;
+	if two > 1 then two = 1 end
+	-- adjust left and right
+	if math.abs(i - n11) < math.abs(j - n11) then right = 1 - left + q;
+	else left = 1 - right + q end
+	return left, right, two;
+end
+
+-- Description: Delete-m Jackknife
+--[[
+  Given g groups of values with a statistics estimated from m[i] samples in
+  i-th group being t[i], compute the mean and the variance. t0 below is the
+  estimate from all samples. Reference:
+
+     Busing et al. (1999) Delete-m Jackknife for unequal m. Statistics and Computing, 9:3-8.
+]]--
+function math.jackknife(g, m, t, t0)
+	local total = 0; -- total number of samples over the g groups
+	for j = 1, g do total = total + m[j] end
+	if t0 == nil then -- no overall estimate given: use the mean of t weighted by m
+		local weighted = 0;
+		for j = 1, g do weighted = weighted + m[j] * t[j] end
+		t0 = weighted / total;
+	end
+	local ratio, acc = {}, 0; -- ratio[j] = total / m[j]
+	for j = 1, g do
+		ratio[j] = total / m[j];
+		acc = acc + (1 - m[j] / total) * t[j];
+	end
+	local mean = g * t0 - acc; -- Eq. (8)
+	local var = 0;
+	for j = 1, g do
+		local dev = ratio[j] * t0 - (ratio[j] - 1) * t[j] - mean;
+		var = var + 1 / (ratio[j] - 1) * dev * dev;
+	end
+	return mean, var / g;
+end
+
+-- Description: Pearson correlation coefficient
+-- Input: a is an n*2 table
+function math.pearson(a)
+	local n = #a
+	-- first pass: column sums, turned into the two means
+	local sx, sy = 0, 0
+	for _, row in pairs(a) do sx, sy = sx + row[1], sy + row[2] end
+	local mx, my = sx / n, sy / n
+	-- second pass: sums of products of the deviations from the means
+	local sxy, sxx, syy = 0, 0, 0
+	for _, row in pairs(a) do
+		local dx = row[1] - mx
+		local dy = row[2] - my
+		sxy, sxx, syy = sxy + dx * dy, sxx + dx * dx, syy + dy * dy
+	end
+	return sxy / math.sqrt(sxx) / math.sqrt(syy)
+end
+
+-- Description: Spearman correlation coefficient
+-- Input: a is a list of rows; columns 1 and 2 of each row are compared
+-- Note: a is modified: it is re-sorted in place (last by column 2) and every
+--       row gains a field r holding its rank values for both columns
+function math.spearman(a)
+	local function aux_func(t) -- auxiliary function
+		-- (t^3 - t) / 12, which evaluates to 0 for t == 1
+		return (t == 1 and 0) or (t*t - 1) * t / 12
+	end
+
+	for _, v in pairs(a) do v.r = {} end
+	local T, S = {}, {}
+	-- compute the rank
+	for k = 1, 2 do
+		table.sort(a, function(u,v) return u[k]<v[k] end)
+		local same = 1 -- length of the current run of equal values
+		T[k] = 0 -- accumulated tie term for column k
+		for i = 2, #a + 1 do -- the extra iteration i == #a + 1 flushes the last run
+			if i <= #a and a[i-1][k] == a[i][k] then same = same + 1
+			else
+				-- the run occupies sorted positions i-same .. i-1; the stored value is
+				-- first + last position, i.e. twice the average rank of the run
+				local rank = (i-1) * 2 - same + 1
+				for j = i - same, i - 1 do a[j].r[k] = rank end
+				if same > 1 then T[k], same = T[k] + aux_func(same), 1 end
+			end
+		end
+		S[k] = aux_func(#a) - T[k]
+	end
+	-- compute the coefficient
+	local sum = 0
+	for _, v in pairs(a) do -- TODO: use nested loops to reduce loss of precision
+		local t = (v.r[1] - v.r[2]) / 2 -- halving undoes the doubled ranks
+		sum = sum + t * t
+	end
+	return (S[1] + S[2] - sum) / 2 / math.sqrt(S[1] * S[2])
+end
+
+-- Description: Hooke-Jeeves derivative-free optimization
+-- Input:
+--   func       objective, called as func(x, data); must return a number
+--   x          starting point (array of numbers); overwritten in place with
+--              the best point found
+--   data       opaque value passed through to func
+--   r          initial search radius and shrink factor (default 0.5)
+--   eps        stop once the radius drops below this value (default 1e-7)
+--   max_calls  upper bound on the number of func evaluations (default 50000)
+-- Output: the last evaluated function value and the number of func calls
+function math.fmin(func, x, data, r, eps, max_calls)
+	local n, n_calls = #x, 0;
+	r = r or 0.5;
+	eps = eps or 1e-7;
+	max_calls = max_calls or 50000
+
+	-- Exploratory move: probe each coordinate of x1 by +/-dx[k], keeping any
+	-- improvement over fx1. Declared local so that each call of math.fmin owns
+	-- its own closure; a global here would be overwritten by a nested call.
+	local function fmin_aux(x1, data, fx1, dx)
+		for k = 1, n do
+			x1[k] = x1[k] + dx[k];
+			local ftmp = func(x1, data); n_calls = n_calls + 1;
+			if ftmp < fx1 then fx1 = ftmp;
+			else -- search the opposite direction
+				dx[k] = -dx[k];
+				x1[k] = x1[k] + dx[k] + dx[k];
+				ftmp = func(x1, data); n_calls = n_calls + 1;
+				if ftmp < fx1 then fx1 = ftmp
+				else x1[k] = x1[k] - dx[k] end -- back to the original x[k]
+			end
+		end
+		return fx1; -- here: fx1=f(n,x1)
+	end
+
+	local dx, x1 = {}, {};
+	for k = 1, n do -- initial directions, based on MGJ
+		dx[k] = math.abs(x[k]) * r;
+		if dx[k] == 0 then dx[k] = r end;
+	end
+	local radius = r;
+	local fx1, fx;
+	fx = func(x, data); fx1 = fx; n_calls = n_calls + 1;
+	while true do
+		for i = 1, n do x1[i] = x[i] end; -- x1 = x
+		fx1 = fmin_aux(x1, data, fx, dx);
+		while fx1 < fx do
+			-- pattern move: accept x1 as the new base point and extrapolate
+			-- once more along the direction x -> x1
+			for k = 1, n do
+				local t = x[k];
+				dx[k] = (x1[k] > x[k] and math.abs(dx[k])) or -math.abs(dx[k]);
+				x[k] = x1[k];
+				x1[k] = x1[k] + x1[k] - t;
+			end
+			fx = fx1;
+			if n_calls >= max_calls then break end
+			fx1 = func(x1, data); n_calls = n_calls + 1;
+			fx1 = fmin_aux(x1, data, fx1, dx);
+			if fx1 >= fx then break end
+			-- stop the pattern moves when no coordinate moved by more than half a step
+			local kk = n;
+			for k = 1, n do
+				if math.abs(x1[k] - x[k]) > .5 * math.abs(dx[k]) then
+					kk = k;
+					break;
+				end
+			end
+			if kk == n then break end
+		end
+		if radius >= eps then
+			if n_calls >= max_calls then break end
+			radius = radius * r;
+			for k = 1, n do dx[k] = dx[k] * r end
+		else break end
+	end
+	return fx1, n_calls;
+end
+
+--
+-- Matrix
+--
+
+matrix = {}
+
+-- Description: matrix transpose
+-- Required by: matrix.mul()
+-- Input: a is an m*n table of rows; the result is a new n*m table
+function matrix.T(a)
+	local rows, cols = #a, #a[1]
+	local t = {}
+	for c = 1, cols do
+		local tc = {}
+		for r = 1, rows do tc[r] = a[r][c] end
+		t[c] = tc
+	end
+	return t
+end
+
+-- Description: matrix add
+-- Note: both operands must have the same dimensions; a new matrix is returned
+function matrix.add(a, b)
+	assert(#a == #b and #a[1] == #b[1]);
+	local rows, cols = #a, #a[1]
+	local sum = {}
+	for r = 1, rows do
+		local ar, br, sr = a[r], b[r], {}
+		for c = 1, cols do sr[c] = ar[c] + br[c] end
+		sum[r] = sr
+	end
+	return sum
+end
+
+-- Description: matrix mul
+-- Dependent on: matrix.T()
+-- Note: much slower without transpose
+-- Input: a is m*n and b is n*p; the result is a new m*p matrix
+function matrix.mul(a, b)
+	assert(#a[1] == #b);
+	local rows, inner, cols = #a, #a[1], #b[1]
+	local bt = matrix.T(b) -- rows of bt are the columns of b
+	local prod = {}
+	for i = 1, rows do
+		local ai, out = a[i], {}
+		for j = 1, cols do
+			local btj, acc = bt[j], 0
+			for k = 1, inner do acc = acc + ai[k] * btj[k] end
+			out[j] = acc
+		end
+		prod[i] = out
+	end
+	return prod
+end
+
+-- Description: matrix print
+-- Output: one text line per row, cells separated by TAB, rows by newline
+function matrix.tostring(a)
+	local lines, nrows = {}, #a
+	for r = 1, nrows do
+		lines[#lines + 1] = table.concat(a[r], "\t")
+	end
+	return table.concat(lines, "\n")
+end
+
+-- Description: chi^2 test for contingency tables
+-- Dependent on: math.igamma()
+-- Input: a is a table of rows holding the observed counts
+-- Output: the chi^2 statistic, the value returned by math.igamma() for it
+--         (presumably the P-value; math.igamma is defined elsewhere), and a
+--         boolean that is false only when a 2x2 table has a zero marginal
+function matrix.chi2(a)
+	if #a == 2 and #a[1] == 2 then -- 2x2 table
+		local x, z
+		-- x is the product of the two row sums and the two column sums
+		x = (a[1][1] + a[1][2]) * (a[2][1] + a[2][2]) * (a[1][1] + a[2][1]) * (a[1][2] + a[2][2])
+		if x == 0 then return 0, 1, false end
+		-- closed form: N * (ad - bc)^2 / (product of marginals)
+		z = a[1][1] * a[2][2] - a[1][2] * a[2][1]
+		z = (a[1][1] + a[1][2] + a[2][1] + a[2][2]) * z * z / x
+		return z, math.igamma(.5, .5 * z, true), true
+	else -- generic table
+		local rs, cs, n, m, N, z = {}, {}, #a, #a[1], 0, 0
+		for i = 1, n do rs[i] = 0 end
+		for j = 1, m do cs[j] = 0 end
+		for i = 1, n do -- compute column sum and row sum
+			for j = 1, m do cs[j], rs[i] = cs[j] + a[i][j], rs[i] + a[i][j] end
+		end
+		for i = 1, n do N = N + rs[i] end
+		for i = 1, n do -- compute the chi^2 statistics
+			for j = 1, m do
+				-- NOTE(review): E is 0 when a row or column sums to 0, which makes
+				-- the division below produce nan/inf; no guard exists here
+				local E = rs[i] * cs[j] / N;
+				z = z + (a[i][j] - E) * (a[i][j] - E) / E
+			end
+		end
+		-- first igamma argument is half of (n-1)*(m-1)
+		return z, math.igamma(.5 * (n-1) * (m-1), .5 * z, true), true;
+	end
+end
+
+-- Description: Gauss-Jordan elimination (solving equations; computing inverse)
+-- Note: on return, a[n][n] is the inverse; b[n][m] is the solution
+-- Reference: Section 2.1, Numerical Recipes in C, 2nd edition
+-- Input: a is a square n*n matrix; b is an optional n*m matrix of right-hand
+--        sides (pass nil to compute the inverse only). Both are overwritten.
+-- Output: 0 on success; -2 or -3 if the matrix is found to be singular
+function matrix.solve(a, b)
+	assert(#a == #a[1]);
+	local n, m = #a, (b and #b[1]) or 0;
+	local xc, xr, ipiv = {}, {}, {};
+	local ic, ir;
+
+	for j = 1, n do ipiv[j] = 0 end
+	for i = 1, n do
+		-- full pivoting: pick the largest remaining element as the pivot
+		local big = 0;
+		for j = 1, n do
+			local aj = a[j];
+			if ipiv[j] ~= 1 then
+				for k = 1, n do
+					if ipiv[k] == 0 then
+						if math.abs(aj[k]) >= big then
+							big = math.abs(aj[k]);
+							ir, ic = j, k;
+						end
+					elseif ipiv[k] > 1 then return -2 end -- singular matrix
+				end
+			end
+		end
+		ipiv[ic] = ipiv[ic] + 1;
+		-- move the pivot onto the diagonal by swapping rows ir and ic
+		if ir ~= ic then
+			for l = 1, n do a[ir][l], a[ic][l] = a[ic][l], a[ir][l] end
+			if b then
+				for l = 1, m do b[ir][l], b[ic][l] = b[ic][l], b[ir][l] end
+			end
+		end
+		xr[i], xc[i] = ir, ic;
+		if a[ic][ic] == 0 then return -3 end -- singular matrix
+		local pivinv = 1 / a[ic][ic];
+		a[ic][ic] = 1;
+		for l = 1, n do a[ic][l] = a[ic][l] * pivinv end
+		if b then
+			-- b has m columns, not n: scale exactly the columns that exist
+			for l = 1, m do b[ic][l] = b[ic][l] * pivinv end
+		end
+		-- eliminate column ic from every other row
+		for ll = 1, n do
+			if ll ~= ic then
+				local tmp = a[ll][ic];
+				a[ll][ic] = 0;
+				local all, aic = a[ll], a[ic];
+				for l = 1, n do all[l] = all[l] - aic[l] * tmp end
+				if b then
+					local bll, bic = b[ll], b[ic];
+					for l = 1, m do bll[l] = bll[l] - bic[l] * tmp end
+				end
+			end
+		end
+	end
+	-- undo the column permutation implied by the row swaps, in reverse order
+	for l = n, 1, -1 do
+		if xr[l] ~= xc[l] then
+			for k = 1, n do a[k][xr[l]], a[k][xc[l]] = a[k][xc[l]], a[k][xr[l]] end
+		end
+	end
+	return 0;
+end
diff --git a/web/server/h2o/libh2o/deps/klib/test/Makefile b/web/server/h2o/libh2o/deps/klib/test/Makefile
new file mode 100644
index 000000000..a392c8ed4
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/test/Makefile
@@ -0,0 +1,60 @@
+# Build the klib test and benchmark programs; headers are taken from the
+# parent directory via -I..
+CC=gcc
+CXX=g++
+CFLAGS=-g -Wall -O2 -I..
+CXXFLAGS=$(CFLAGS)
+PROGS=kbtree_test khash_keith khash_keith2 khash_test klist_test kseq_test kseq_bench \
+		kseq_bench2 ksort_test ksort_test-stl kvec_test kmin_test kstring_bench kstring_bench2 kstring_test \
+		kthread_test
+
+# all and clean do not produce files of the same name; keep make from
+# skipping them when such a file happens to exist
+.PHONY: all clean
+
+all:$(PROGS)
+
+clean:
+		rm -fr $(PROGS) *.dSYM a.out
+
+# Header-only components: each test depends on its source and the header
+kbtree_test:kbtree_test.c ../kbtree.h
+		$(CC) $(CFLAGS) -o $@ kbtree_test.c
+
+khash_keith:khash_keith.c ../khash.h
+		$(CC) $(CFLAGS) -o $@ khash_keith.c
+
+khash_keith2:khash_keith2.c ../khash.h
+		$(CC) $(CFLAGS) -o $@ khash_keith2.c
+
+khash_test:khash_test.c ../khash.h
+		$(CC) $(CFLAGS) -o $@ khash_test.c
+
+klist_test:klist_test.c ../klist.h
+		$(CC) $(CFLAGS) -o $@ klist_test.c
+
+# The kseq programs read gzip input and therefore link against zlib
+kseq_test:kseq_test.c ../kseq.h
+		$(CC) $(CFLAGS) -o $@ kseq_test.c -lz
+
+kseq_bench:kseq_bench.c ../kseq.h
+		$(CC) $(CFLAGS) -o $@ kseq_bench.c -lz
+
+kseq_bench2:kseq_bench2.c ../kseq.h
+		$(CC) $(CFLAGS) -o $@ kseq_bench2.c -lz
+
+ksort_test:ksort_test.c ../ksort.h
+		$(CC) $(CFLAGS) -o $@ ksort_test.c
+
+ksort_test-stl:ksort_test.cc ../ksort.h
+		$(CXX) $(CXXFLAGS) -o $@ ksort_test.cc
+
+kvec_test:kvec_test.cc ../kvec.h
+		$(CXX) $(CXXFLAGS) -o $@ kvec_test.cc
+
+# Components with a .c implementation are compiled together with the test
+kmin_test:kmin_test.c ../kmath.h ../kmath.c
+		$(CC) $(CFLAGS) -o $@ kmin_test.c ../kmath.c
+
+kstring_bench:kstring_bench.c ../kstring.h ../kstring.c
+		$(CC) $(CFLAGS) -o $@ kstring_bench.c ../kstring.c
+
+kstring_bench2:kstring_bench2.c ../kstring.h ../kstring.c
+		$(CC) $(CFLAGS) -o $@ kstring_bench2.c ../kstring.c
+
+kstring_test:kstring_test.c ../kstring.h ../kstring.c
+		$(CC) $(CFLAGS) -o $@ kstring_test.c ../kstring.c
+
+kthread_test:kthread_test.c ../kthread.c
+		$(CC) $(CFLAGS) -fopenmp -o $@ kthread_test.c ../kthread.c
diff --git a/web/server/h2o/libh2o/deps/klib/test/kbit_test.c b/web/server/h2o/libh2o/deps/klib/test/kbit_test.c
new file mode 100644
index 000000000..3ae3bd309
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/test/kbit_test.c
@@ -0,0 +1,137 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+#include <emmintrin.h>
+#include "kbit.h"
+
+// from bowtie-0.9.8.1
+// Count the number of set bits in x (the kbi_popcount64() equivalence; similar
+// to popcount_2() in wiki). Pairs, nibbles and bytes are summed in parallel,
+// then the per-byte counts are folded into the lowest byte.
+inline static int bt1_pop64(uint64_t x)
+{
+   x -= ((x >> 1) & 0x5555555555555555llu);
+   x = (x & 0x3333333333333333llu) + ((x >> 2) & 0x3333333333333333llu);
+   x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Fllu;
+   x = x + (x >> 8);
+   x = x + (x >> 16);
+   x = x + (x >> 32);
+   // the result ranges over 0..64, so seven bits are needed; a 0x3F mask
+   // would report 0 for an all-ones word
+   return x & 0x7F;
+}
+
+// Count how many of the 32 two-bit symbols packed in dw equal c (the
+// kbi_DNAcount64() equivalence). Values of c outside 0..2 are handled as 3.
+inline static int bt1_countInU64(uint64_t dw, int c)
+{
+	const uint64_t hi_mask = 0xAAAAAAAAAAAAAAAAllu;
+	uint64_t hi = (dw & hi_mask) >> 1; // high bit of each symbol, moved to the low position
+	uint64_t lo = dw & ~hi_mask;       // low bit of each symbol
+	uint64_t hits;
+	int cnt;
+	if (c == 0) hits = hi | lo;        // marks the symbols that are NOT 0
+	else if (c == 1) hits = ~hi & lo;
+	else if (c == 2) hits = hi & ~lo;
+	else hits = hi & lo;
+	cnt = bt1_pop64(hits);
+	return c == 0? 32 - cnt : cnt;     // for c == 0 the non-matches were counted
+}
+
+// from bigmagic
+// Sum the number of set bits over the 128-bit blocks in [block, block_end).
+// Each block is treated as four 32-bit lanes whose bit counts are accumulated
+// lane-wise in mcnt and added together at the end.
+// NOTE: this is a do-while loop, so one block is read even when
+// block == block_end; _mm_load_si128 is the aligned load, so block is
+// presumably expected to be 16-byte aligned -- confirm against the caller.
+static uint32_t sse2_bit_count32(const __m128i* block, const __m128i* block_end)
+{
+    const unsigned mu1 = 0x55555555;
+    const unsigned mu2 = 0x33333333;
+    const unsigned mu3 = 0x0F0F0F0F;
+    const unsigned mu4 = 0x0000003F;
+
+	uint32_t tcnt[4];
+
+    // Loading masks
+    __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
+    __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
+    __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
+    __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
+    __m128i mcnt;
+    mcnt = _mm_xor_si128(m1, m1); // cnt = 0
+
+    __m128i tmp1, tmp2;
+    do
+    {        
+        __m128i b = _mm_load_si128(block);
+        ++block;
+
+        // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
+        tmp1 = _mm_srli_epi32(b, 1);                    // tmp1 = (b >> 1 & 0x55555555)
+        tmp1 = _mm_and_si128(tmp1, m1); 
+        tmp2 = _mm_and_si128(b, m1);                    // tmp2 = (b & 0x55555555)
+        b    = _mm_add_epi32(tmp1, tmp2);               //  b = tmp1 + tmp2
+
+        // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
+        tmp1 = _mm_srli_epi32(b, 2);                    // (b >> 2 & 0x33333333)
+        tmp1 = _mm_and_si128(tmp1, m2); 
+        tmp2 = _mm_and_si128(b, m2);                    // (b & 0x33333333)
+        b    = _mm_add_epi32(tmp1, tmp2);               // b = tmp1 + tmp2
+
+        // b = (b + (b >> 4)) & 0x0F0F0F0F;
+        tmp1 = _mm_srli_epi32(b, 4);                    // tmp1 = b >> 4
+        b = _mm_add_epi32(b, tmp1);                     // b = b + (b >> 4)
+        b = _mm_and_si128(b, m3);                       //           & 0x0F0F0F0F
+
+        // b = b + (b >> 8);
+        tmp1 = _mm_srli_epi32 (b, 8);                   // tmp1 = b >> 8
+        b = _mm_add_epi32(b, tmp1);                     // b = b + (b >> 8)
+
+        // b = (b + (b >> 16)) & 0x0000003F;
+        // (a 32-bit lane holds at most 32 set bits, so six bits suffice)
+        tmp1 = _mm_srli_epi32 (b, 16);                  // b >> 16
+        b = _mm_add_epi32(b, tmp1);                     // b + (b >> 16)
+        b = _mm_and_si128(b, m4);                       // (b >> 16) & 0x0000003F;
+
+        mcnt = _mm_add_epi32(mcnt, b);                  // mcnt += b
+
+    } while (block < block_end);
+
+    // spill the four lane totals and add them up
+    _mm_store_si128((__m128i*)tcnt, mcnt);
+
+    return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
+}
+
+// Benchmark several popcount and 2-bit symbol counting implementations on the
+// same array of pseudo-random 64-bit words; each row printed to stderr holds
+// the method name, the accumulated count and the CPU time in seconds.
+int main(void)
+{
+	int i, N = 100000000;
+	uint64_t *x, cnt;
+	clock_t t;
+	int c = 1;
+
+	x = (uint64_t*)calloc(N, 8);
+	if (x == NULL) { // the array needs N * 8 bytes; do not dereference a failed allocation
+		fprintf(stderr, "ERROR: failed to allocate %d 64-bit words\n", N);
+		return 1;
+	}
+	srand48(11);
+	for (i = 0; i < N; ++i)
+		x[i] = (uint64_t)lrand48() << 32 ^ lrand48();
+
+	fprintf(stderr, "\n===> Calculate # of 1 in an integer (popcount) <===\n");
+
+	t = clock(); cnt = 0;
+	for (i = 0; i < N; ++i) cnt += kbi_popcount64(x[i]);
+	fprintf(stderr, "%20s\t%20ld\t%10.6f\n", "kbit", (long)cnt, (double)(clock() - t) / CLOCKS_PER_SEC);
+
+	t = clock(); cnt = 0;
+	for (i = 0; i < N; ++i) cnt += bt1_pop64(x[i]);
+	fprintf(stderr, "%20s\t%20ld\t%10.6f\n", "wiki-popcount_2", (long)cnt, (double)(clock() - t) / CLOCKS_PER_SEC);
+
+	// the "ll" variant takes unsigned long long, so all 64 bits are counted
+	// even where long is only 32 bits wide
+	t = clock(); cnt = 0;
+	for (i = 0; i < N; ++i) cnt += __builtin_popcountll(x[i]);
+	fprintf(stderr, "%20s\t%20ld\t%10.6f\n", "__builtin_popcountll", (long)cnt, (double)(clock() - t) / CLOCKS_PER_SEC);
+
+	t = clock(); cnt = 0;
+	cnt += sse2_bit_count32((__m128i*)x, (__m128i*)(x+N));
+	fprintf(stderr, "%20s\t%20ld\t%10.6f\n", "SSE2-32bit", (long)cnt, (double)(clock() - t) / CLOCKS_PER_SEC);
+
+	fprintf(stderr, "\n===> Count '%c' in 2-bit encoded integers <===\n", "ACGT"[c]);
+
+	t = clock(); cnt = 0;
+	for (i = 0; i < N; ++i) cnt += kbi_DNAcount64(x[i], c);
+	fprintf(stderr, "%20s\t%20ld\t%10.6f\n", "kbit", (long)cnt, (double)(clock() - t) / CLOCKS_PER_SEC);
+
+	t = clock(); cnt = 0;
+	for (i = 0; i < N; ++i) cnt += bt1_countInU64(x[i], c);
+	fprintf(stderr, "%20s\t%20ld\t%10.6f\n", "bowtie1", (long)cnt, (double)(clock() - t) / CLOCKS_PER_SEC);
+
+	fprintf(stderr, "\n");
+	free(x);
+	return 0;
+}
diff --git a/web/server/h2o/libh2o/deps/klib/test/kbtree_test.c b/web/server/h2o/libh2o/deps/klib/test/kbtree_test.c
new file mode 100644
index 000000000..8e1068767
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/test/kbtree_test.c
@@ -0,0 +1,94 @@
+#include <stdio.h>
+#include <assert.h>
+#include <time.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+typedef const char *str_t;
+
+#include "kbtree.h"
+KBTREE_INIT(int, uint32_t, kb_generic_cmp)
+KBTREE_INIT(str, str_t, kb_str_cmp)
+
+static int data_size = 5000000;
+static unsigned *int_data;
+static char **str_data;
+
+/* Fill int_data[] with data_size pseudo-random integers (fixed seed, so every
+ * run sees the same data) and str_data[] with their hexadecimal spellings. */
+void ht_init_data()
+{
+	char hex[256];
+	int idx;
+	printf("--- generating data... ");
+	srand48(11);
+	int_data = (unsigned*)calloc(data_size, sizeof(unsigned));
+	str_data = (char**)calloc(data_size, sizeof(char*));
+	idx = 0;
+	while (idx < data_size) {
+		unsigned v = (unsigned)(data_size * drand48() / 4) * 271828183u;
+		int_data[idx] = v;
+		sprintf(hex, "%x", v);
+		str_data[idx] = strdup(hex); /* released by ht_destroy_data() */
+		++idx;
+	}
+	printf("done!\n");
+}
+/* Release every string in str_data[] and then both data arrays. */
+void ht_destroy_data()
+{
+	int idx = 0;
+	while (idx < data_size) free(str_data[idx++]);
+	free(str_data);
+	free(int_data);
+}
+
+/* Integer-key exercise: every value of int_data[] is inserted if absent and
+ * deleted if present, then the interval lookup, traversal and first-element
+ * macros are exercised and their results printed. */
+void ht_khash_int()
+{
+	int i;
+	unsigned *data = int_data;
+	uint32_t *l, *u;
+	kbtree_t(int) *h;
+
+	h = kb_init(int, KB_DEFAULT_SIZE);
+	for (i = 0; i < data_size; ++i) {
+		if (kb_get(int, h, data[i]) == 0) kb_put(int, h, data[i]);
+		else kb_del(int, h, data[i]);
+	}
+	printf("[ht_khash_int] size: %d\n", kb_size(h));
+	if (1) {
+		int cnt = 0;
+		uint32_t x = 0, y = 0; /* defined values even if nothing is visited */
+		kb_interval(int, h, 2174625464u, &l, &u);
+		printf("interval for 2174625464: (%u, %u)\n", l? *l : 0, u? *u : 0);
+#define traverse_f(p) { if (cnt == 0) y = *p; ++cnt; }
+		__kb_traverse(uint32_t, h, traverse_f);
+#undef traverse_f
+		__kb_get_first(uint32_t, h, x);
+		printf("# of elements from traversal: %d\n", cnt);
+		/* the keys are unsigned 32-bit values; %d would show large ones as negative */
+		printf("first element: %u == %u\n", x, y);
+	}
+	__kb_destroy(h);
+}
+/* String-key exercise: every string of str_data[] is inserted if absent and
+ * deleted if present; the final tree size is printed. */
+void ht_khash_str()
+{
+	int i;
+	char **data = str_data;
+	kbtree_t(str) *h;
+
+	h = kb_init(str, KB_DEFAULT_SIZE);
+	for (i = 0; i < data_size; ++i) {
+		if (kb_get(str, h, data[i]) == 0) kb_put(str, h, data[i]);
+		else kb_del(str, h, data[i]);
+	}
+	/* label names this function, so the two reports can be told apart */
+	printf("[ht_khash_str] size: %d\n", kb_size(h));
+	__kb_destroy(h);
+}
+/* Call f once and print the CPU time it consumed, in seconds. */
+void ht_timing(void (*f)(void))
+{
+	clock_t start, elapsed;
+	start = clock();
+	f();
+	elapsed = clock() - start;
+	printf("[ht_timing] %.3lf sec\n", (double)elapsed / CLOCKS_PER_SEC);
+}
+/* Usage: kbtree_test [data_size]
+ * Generates the test data, then times the integer-key and the string-key
+ * exercises. */
+int main(int argc, char *argv[])
+{
+	/* optional first argument overrides the default number of elements */
+	if (argc > 1) data_size = atoi(argv[1]);
+	ht_init_data();
+	ht_timing(ht_khash_int);
+	ht_timing(ht_khash_str);
+	ht_destroy_data();
+	return 0;
+}
diff --git a/web/server/h2o/libh2o/deps/klib/test/kgraph_test.c b/web/server/h2o/libh2o/deps/klib/test/kgraph_test.c
new file mode 100644
index 000000000..3da1cd71b
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/test/kgraph_test.c
@@ -0,0 +1,26 @@
+#include <stdio.h>
+#include "kgraph.h"
+
+KHASH_INIT2(e32, extern, uint32_t, int, 1, kh_int_hash_func, kh_int_hash_equal)
+
+typedef struct {
+	int i;
+	khash_t(e32) *_arc;
+} vertex_t;
+
+KGRAPH_INIT(g, extern, vertex_t, int, e32)
+KGRAPH_PRINT(g, extern)
+
+/* Smoke test for the generated graph functions: add three arcs among the
+ * vertices 10, 20 and 30, delete vertex 20, print what is left and destroy
+ * the graph. */
+int main()
+{
+	int *pb, *pe; /* outputs of kg_put_a_g(); not inspected here */
+	kgraph_t(g) *g;
+	g = kg_init_g();
+	kg_put_a_g(g, 10, 20, 0, &pb, &pe);
+	kg_put_a_g(g, 20, 30, 0, &pb, &pe);
+	kg_put_a_g(g, 30, 10, 1, &pb, &pe);
+	kg_del_v_g(g, 20);
+	kg_print_g(g);
+	kg_destroy_g(g);
+	return 0;
+}
diff --git a/web/server/h2o/libh2o/deps/klib/test/khash_keith.c b/web/server/h2o/libh2o/deps/klib/test/khash_keith.c
new file mode 100644
index 000000000..ddd755ac7
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/test/khash_keith.c
@@ -0,0 +1,95 @@
+/*
+ * This is an optimized version of the following C++ program:
+ *
+ *   http://keithlea.com/javabench/src/cpp/hash.cpp
+ *
+ * Keith in his benchmark (http://keithlea.com/javabench/data) showed that the
+ * Java implementation is twice as fast as the C++ version. In fact, this is
+ * only because the C++ implementation is substandard. Most importantly, Keith
+ * is using "sprintf()" to convert an integer to a string, which is known to be
+ * extremely inefficient.
+ */
+#include <stdio.h>
+#include "khash.h"
+KHASH_MAP_INIT_STR(str, int)
+
+/* Write the textual form of c in the given base into ret, NUL-terminated.
+ *   c     value to convert; negative values get a leading '-'
+ *   base  radix, 2..16 (digits come from "0123456789abcdef")
+ *   ret   caller-provided buffer, large enough for the digits, the optional
+ *         sign and the terminating NUL
+ * Declared static: a plain C99 "inline" definition provides no external
+ * definition, so the program fails to link whenever the call is not inlined. */
+static inline void int2str(int c, int base, char *ret)
+{
+	const char *tab = "0123456789abcdef";
+	if (c == 0) ret[0] = '0', ret[1] = 0;
+	else {
+		int l, y;
+		/* take the magnitude in unsigned arithmetic: -c overflows for INT_MIN */
+		unsigned x = c < 0? 0u - (unsigned)c : (unsigned)c;
+		unsigned ubase = (unsigned)base;
+		char buf[34]; /* up to 32 binary digits plus the sign */
+		for (l = 0; x > 0; x /= ubase) buf[l++] = tab[x%ubase];
+		if (c < 0) buf[l++] = '-';
+		/* digits were produced least significant first; emit them reversed */
+		for (--l, y = 0; l >= 0; --l) ret[y++] = buf[l];
+		ret[y] = 0;
+	}
+}
+
+#ifndef _USE_STRDUP
+#define BLOCK_SIZE 0x100000
+/* Benchmark variant that stores the keys in large malloc'ed blocks instead of
+ * one strdup() per key. Inserts the hexadecimal spellings of 1..n, then looks
+ * up the decimal spellings of 1..n and prints how many were found.
+ * Usage: khash_keith [n] */
+int main(int argc, char *argv[])
+{
+	char **mem = 0; /* array of key-storage blocks, each BLOCK_SIZE bytes */
+	int i, l, n = 1000000, ret, block_end = 0, curr = 0, c = 0;
+	khash_t(str) *h;
+	h = kh_init(str);
+	if (argc > 1) n = atoi(argv[1]);
+	mem = malloc(sizeof(void*));
+	mem[0] = malloc(BLOCK_SIZE); // memory buffer to avoid memory fragmentation
+	curr = block_end = 0;
+	for (i = 1; i <= n; ++i) {
+		char buf[16];
+		int2str(i, 16, buf);
+		/* insert using the stack buffer as a temporary key ... */
+		khint_t k = kh_put(str, h, buf, &ret);
+		l = strlen(buf) + 1;
+		if (block_end + l > BLOCK_SIZE) { /* current block is full: start a new one */
+			++curr; block_end = 0;
+			mem = realloc(mem, (curr + 1) * sizeof(void*));
+			mem[curr] = malloc(BLOCK_SIZE);
+		}
+		memcpy(mem[curr] + block_end, buf, l);
+		/* ... then repoint the stored key at the persistent copy */
+		kh_key(h, k) = mem[curr] + block_end;
+		block_end += l;
+		kh_val(h, k) = i;
+	}
+	for (i = 1; i <= n; ++i) {
+		char buf[16];
+		int2str(i, 10, buf);
+		khint_t k = kh_get(str, h, buf);
+		if (k != kh_end(h)) ++c; /* kh_end(h) is what kh_get yields for a miss */
+	}
+	printf("%d\n", c);
+	for (ret = 0; ret <= curr; ++ret) free(mem[ret]);
+	free(mem);
+	kh_destroy(str, h);
+	return 0;
+}
+#else // _USE_STRDUP
+/* Benchmark variant that strdup()s every key. Inserts the hexadecimal
+ * spellings of 1..n, then looks up the decimal spellings of 1..n and prints
+ * how many were found.
+ * Usage: khash_keith [n] */
+int main(int argc, char *argv[])
+{
+	int i, l, n = 1000000, ret, c = 0; /* NOTE: l is unused in this variant */
+	khash_t(str) *h;
+	khint_t k;
+	h = kh_init(str);
+	if (argc > 1) n = atoi(argv[1]);
+	for (i = 1; i <= n; ++i) {
+		char buf[16];
+		int2str(i, 16, buf);
+		/* the table keeps the strdup'ed pointer; it is freed in the loop below */
+		k = kh_put(str, h, strdup(buf), &ret);
+		kh_val(h, k) = i;
+	}
+	for (i = 1; i <= n; ++i) {
+		char buf[16];
+		int2str(i, 10, buf);
+		k = kh_get(str, h, buf);
+		if (k != kh_end(h)) ++c; /* kh_end(h) is what kh_get yields for a miss */
+	}
+	for (k = kh_begin(h); k != kh_end(h); ++k) // explicitly freeing memory takes 10-20% CPU time.
+		if (kh_exist(h, k)) free((char*)kh_key(h, k));
+	printf("%d\n", c);
+	kh_destroy(str, h);
+	return 0;
+}
+#endif
diff --git a/web/server/h2o/libh2o/deps/klib/test/khash_keith2.c b/web/server/h2o/libh2o/deps/klib/test/khash_keith2.c
new file mode 100644
index 000000000..b9df9b7c1
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/test/khash_keith2.c
@@ -0,0 +1,67 @@
+/*
+ * This is an optimized version of the following C++ program:
+ *
+ *   http://keithlea.com/javabench/src/cpp/hash.cpp
+ *
+ * Keith in his benchmark (http://keithlea.com/javabench/data) showed that the
+ * Java implementation is twice as fast as the C++ version. In fact, this is
+ * only because the C++ implementation is substandard. Most importantly, Keith
+ * is using "sprintf()" to convert an integer to a string, which is known to be
+ * extremely inefficient.
+ */
+#include <stdio.h>
+#include "khash.h"
+KHASH_MAP_INIT_STR(str, int)
+
+/* Write the textual form of c in the given base into ret, NUL-terminated.
+ *   c     value to convert; negative values get a leading '-'
+ *   base  radix, 2..16 (digits come from "0123456789abcdef")
+ *   ret   caller-provided buffer, large enough for the digits, the optional
+ *         sign and the terminating NUL
+ * Declared static: a plain C99 "inline" definition provides no external
+ * definition, so the program fails to link whenever the call is not inlined. */
+static inline void int2str(int c, int base, char *ret)
+{
+	const char *tab = "0123456789abcdef";
+	if (c == 0) ret[0] = '0', ret[1] = 0;
+	else {
+		int l, y;
+		/* take the magnitude in unsigned arithmetic: -c overflows for INT_MIN */
+		unsigned x = c < 0? 0u - (unsigned)c : (unsigned)c;
+		unsigned ubase = (unsigned)base;
+		char buf[34]; /* up to 32 binary digits plus the sign */
+		for (l = 0; x > 0; x /= ubase) buf[l++] = tab[x%ubase];
+		if (c < 0) buf[l++] = '-';
+		/* digits were produced least significant first; emit them reversed */
+		for (--l, y = 0; l >= 0; --l) ret[y++] = buf[l];
+		ret[y] = 0;
+	}
+}
+
+/* Builds a table h with the keys "foo_0" .. "foo_9999" (value = index), then
+ * n times folds every entry of h into a second table h2, adding up the values
+ * of keys that are already present. Prints four sample values.
+ * Usage: khash_keith2 [n] */
+int main(int argc, char *argv[])
+{
+	int i, l, n = 1000, ret; /* NOTE: l is unused */
+	khash_t(str) *h, *h2;
+	khint_t k;
+	h = kh_init(str);
+	h2 = kh_init(str);
+	if (argc > 1) n = atoi(argv[1]);
+	for (i = 0; i < 10000; ++i) {
+		char buf[32];
+		strcpy(buf, "foo_");
+		int2str(i, 10, buf+4);
+		k = kh_put(str, h, strdup(buf), &ret);
+		kh_val(h, k) = i;
+	}
+	for (i = 0; i < n; ++i) {
+		for (k = kh_begin(h); k != kh_end(h); ++k) {
+			if (kh_exist(h, k)) {
+				/* probe h2 with h's key pointer; give h2 its own copy only on insertion */
+				khint_t k2 = kh_put(str, h2, kh_key(h, k), &ret);
+				if (ret) { // absent
+					kh_key(h2, k2) = strdup(kh_key(h, k));
+					kh_val(h2, k2) = kh_val(h, k);
+				} else kh_val(h2, k2) += kh_val(h, k);
+			}
+		}
+	}
+	/* NOTE(review): the lookups below are not compared against kh_end(); h2
+	 * stays empty when n <= 0, in which case these keys are absent from it */
+	k = kh_get(str, h, "foo_1"); printf("%d", kh_val(h, k));
+	k = kh_get(str, h, "foo_9999"); printf(" %d", kh_val(h, k));
+	k = kh_get(str, h2, "foo_1"); printf(" %d", kh_val(h2, k));
+	k = kh_get(str, h2, "foo_9999"); printf(" %d\n", kh_val(h2, k));
+	/* both tables own their key strings */
+	for (k = kh_begin(h); k != kh_end(h); ++k)
+		if (kh_exist(h, k)) free((char*)kh_key(h, k));
+	for (k = kh_begin(h2); k != kh_end(h2); ++k)
+		if (kh_exist(h2, k)) free((char*)kh_key(h2, k));
+	kh_destroy(str, h);
+	kh_destroy(str, h2);
+	return 0;
+}
diff --git a/web/server/h2o/libh2o/deps/klib/test/khash_test.c b/web/server/h2o/libh2o/deps/klib/test/khash_test.c
new file mode 100644
index 000000000..8d6687ff4
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/test/khash_test.c
@@ -0,0 +1,141 @@
+#include <stdio.h>
+#include <assert.h>
+#include <time.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "khash.h"
+KHASH_SET_INIT_STR(str)
+KHASH_MAP_INIT_INT(int, unsigned char)
+
+typedef struct {
+	unsigned key;
+	unsigned char val;
+} int_unpack_t;
+
+typedef struct {
+	unsigned key;
+	unsigned char val;
+} __attribute__ ((__packed__)) int_packed_t;
+
+#define hash_eq(a, b) ((a).key == (b).key)
+#define hash_func(a) ((a).key)
+
+KHASH_INIT(iun, int_unpack_t, char, 0, hash_func, hash_eq)
+KHASH_INIT(ipk, int_packed_t, char, 0, hash_func, hash_eq)
+
+static int data_size = 5000000;
+static unsigned *int_data;
+static char **str_data;
+
+/* Fill int_data[] with data_size pseudo-random integers drawn from a small
+ * linear congruential generator (fixed seed, so every run sees the same
+ * data) and str_data[] with their hexadecimal spellings. */
+void ht_init_data()
+{
+	char hex[256];
+	khint32_t seed = 11;
+	int idx;
+	printf("--- generating data... ");
+	int_data = (unsigned*)calloc(data_size, sizeof(unsigned));
+	str_data = (char**)calloc(data_size, sizeof(char*));
+	idx = 0;
+	while (idx < data_size) {
+		unsigned v = (unsigned)(data_size * ((double)seed / UINT_MAX) / 4) * 271828183u;
+		int_data[idx] = v;
+		sprintf(hex, "%x", v);
+		str_data[idx] = strdup(hex); /* released by ht_destroy_data() */
+		seed = 1664525L * seed + 1013904223L; /* advance the generator */
+		++idx;
+	}
+	printf("done!\n");
+}
+
+/* Release every string in str_data[] and then both data arrays. */
+void ht_destroy_data()
+{
+	char **p = str_data;
+	int left;
+	for (left = data_size; left > 0; --left) free(*p++);
+	free(str_data);
+	free(int_data);
+}
+
+/* Integer-key exercise: every value of int_data[] is put into the table; a
+ * value that was already present is deleted instead, so the final size counts
+ * the values seen an odd number of times. */
+void ht_khash_int()
+{
+	int i, ret;
+	unsigned *data = int_data;
+	khash_t(int) *h;
+	unsigned k;
+
+	h = kh_init(int);
+	for (i = 0; i < data_size; ++i) {
+		k = kh_put(int, h, data[i], &ret);
+		kh_val(h, k) = i&0xff;
+		if (!ret) kh_del(int, h, k); /* ret == 0: the key was already in the table */
+	}
+	printf("[ht_khash_int] size: %u\n", kh_size(h));
+	kh_destroy(int, h);
+}
+
+/* String-key exercise: every string of str_data[] is put into the set; a
+ * string that was already present is deleted instead. Prints the final size. */
+void ht_khash_str()
+{
+	int i, ret;
+	char **data = str_data;
+	khash_t(str) *h;
+	unsigned k;
+
+	h = kh_init(str);
+	for (i = 0; i < data_size; ++i) {
+		k = kh_put(str, h, data[i], &ret);
+		if (!ret) kh_del(str, h, k); /* ret == 0: the key was already in the set */
+	}
+	/* label names this function, so the reports can be told apart */
+	printf("[ht_khash_str] size: %u\n", kh_size(h));
+	kh_destroy(str, h);
+}
+
+/* Same insert-or-delete exercise with the naturally aligned struct
+ * int_unpack_t as the key type; also prints sizeof(int_unpack_t). */
+void ht_khash_unpack()
+{
+	int i, ret;
+	unsigned *data = int_data;
+	khash_t(iun) *h;
+	unsigned k;
+
+	h = kh_init(iun);
+	for (i = 0; i < data_size; ++i) {
+		int_unpack_t x;
+		x.key = data[i]; x.val = i&0xff;
+		k = kh_put(iun, h, x, &ret);
+		if (!ret) kh_del(iun, h, k); /* ret == 0: the key was already in the table */
+	}
+	/* sizeof yields size_t; convert it to match the %ld conversion */
+	printf("[ht_khash_unpack] size: %u (sizeof=%ld)\n", kh_size(h), (long)sizeof(int_unpack_t));
+	kh_destroy(iun, h);
+}
+
+/* Same insert-or-delete exercise with the packed struct int_packed_t as the
+ * key type; also prints sizeof(int_packed_t). */
+void ht_khash_packed()
+{
+	int i, ret;
+	unsigned *data = int_data;
+	khash_t(ipk) *h;
+	unsigned k;
+
+	h = kh_init(ipk);
+	for (i = 0; i < data_size; ++i) {
+		int_packed_t x;
+		x.key = data[i]; x.val = i&0xff;
+		k = kh_put(ipk, h, x, &ret);
+		if (!ret) kh_del(ipk, h, k); /* ret == 0: the key was already in the table */
+	}
+	/* sizeof yields size_t; convert it to match the %ld conversion */
+	printf("[ht_khash_packed] size: %u (sizeof=%ld)\n", kh_size(h), (long)sizeof(int_packed_t));
+	kh_destroy(ipk, h);
+}
+
+/* Call f once and print the CPU time it consumed, in seconds. */
+void ht_timing(void (*f)(void))
+{
+	clock_t begin = clock();
+	double secs;
+	f();
+	secs = (double)(clock() - begin) / CLOCKS_PER_SEC;
+	printf("[ht_timing] %.3lf sec\n", secs);
+}
+
+/* Usage: khash_test [data_size]
+ * Generates the test data, then times the four hash-table exercises
+ * (integer keys, string keys, unpacked struct keys, packed struct keys). */
+int main(int argc, char *argv[])
+{
+	/* optional first argument overrides the default number of elements */
+	if (argc > 1) data_size = atoi(argv[1]);
+	ht_init_data();
+	ht_timing(ht_khash_int);
+	ht_timing(ht_khash_str);
+	ht_timing(ht_khash_unpack);
+	ht_timing(ht_khash_packed);
+	ht_destroy_data();
+	return 0;
+}
diff --git a/web/server/h2o/libh2o/deps/klib/test/klist_test.c b/web/server/h2o/libh2o/deps/klib/test/klist_test.c
new file mode 100644
index 000000000..cd13813df
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/test/klist_test.c
@@ -0,0 +1,19 @@
+#include <stdio.h>
+#include "klist.h"
+
+#define __int_free(x)
+KLIST_INIT(32, int, __int_free)
+
+/* Smoke test for the generated int list: push 1 and 10, call kl_shift() once,
+ * then print every remaining element and destroy the list. */
+int main()
+{
+	klist_t(32) *kl;
+	kliter_t(32) *p;
+	kl = kl_init(32);
+	/* kl_pushp() yields a pointer to the new element's storage */
+	*kl_pushp(32, kl) = 1;
+	*kl_pushp(32, kl) = 10;
+	kl_shift(32, kl, 0); /* last argument 0: the shifted value is not requested */
+	for (p = kl_begin(kl); p != kl_end(kl); p = kl_next(p))
+		printf("%d\n", kl_val(p));
+	kl_destroy(32, kl);
+	return 0;
+}
diff --git a/web/server/h2o/libh2o/deps/klib/test/kmin_test.c b/web/server/h2o/libh2o/deps/klib/test/kmin_test.c
new file mode 100644
index 000000000..33ccd1cbc
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/test/kmin_test.c
@@ -0,0 +1,48 @@
+#include <stdio.h>
+#include <math.h>
+#include "kmath.h"
+
+/* number of objective evaluations since the counter was last reset */
+static int n_evals;
+
+/* Chebyquad objective function used to exercise the minimizer.
+ *   n     number of variables (the work array limits it to 19)
+ *   x     point at which to evaluate the function
+ *   data  unused
+ * Returns the function value and increments n_evals. */
+double f_Chebyquad(int n, double *x, void *data)
+{
+    double cheb[20][20];
+    double total = 0.;
+    int row, col;
+    int add_term = 1; /* positive on every other row, starting with row 0 */
+    /* rows 0 and 1 seed the three-term recurrence evaluated below */
+    for (col = 0; col != n; ++col) {
+        cheb[0][col] = 1.;
+        cheb[1][col] = 2. * x[col] - 1.;
+    }
+    for (row = 1; row != n; ++row)
+        for (col = 0; col != n; ++col)
+            cheb[row+1][col] = 2. * cheb[1][col] * cheb[row][col] - cheb[row-1][col];
+    /* sum of squared row averages, with a correction term on even rows */
+    for (row = 0; row != n + 1; ++row) {
+        double avg = 0.;
+        for (col = 0; col != n; ++col) avg += cheb[row][col];
+        avg /= n;
+        if (add_term > 0) avg += 1. / ((row - 1) * (row + 1));
+        add_term = -add_term;
+        total += avg * avg;
+    }
+    ++n_evals;
+    return total;
+}
+
+/* Run the Hooke-Jeeves minimizer on the Chebyquad function for n = 2, 4, 6, 8
+ * and print the minimum found together with the number of evaluations. */
+int main()
+{
+	double x[20], y;
+	int n, i;
+	printf("\nMinimizer: Hooke-Jeeves\n");
+	for (n = 2; n <= 8; n += 2) {
+		/* starting point: x[i] = (i + 1) / n */
+		for (i = 0; i != n; ++i) x[i] = (double)(i + 1) / n;
+		n_evals = 0;
+		y = kmin_hj(f_Chebyquad, n, x, 0, KMIN_RADIUS, KMIN_EPS, KMIN_MAXCALL);
+		printf("n=%d,min=%.8lg,n_evals=%d\n", n, y, n_evals);
+	}
+	printf("\n");
+	return 0;
+}
diff --git a/web/server/h2o/libh2o/deps/klib/test/kseq_bench.c b/web/server/h2o/libh2o/deps/klib/test/kseq_bench.c
new file mode 100644
index 000000000..eeda13f71
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/test/kseq_bench.c
@@ -0,0 +1,69 @@
+#include <zlib.h>
+#include <stdio.h>
+#include <time.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include "kseq.h"
+
+#define BUF_SIZE 4096
+KSTREAM_INIT(gzFile, gzread, BUF_SIZE)
+
+/* Usage: kseq_bench <in.gz>
+ * Reads the same gzip file several times, each time through a different API
+ * (raw gzread, ks_getc, ks_getuntil, gzgetc, gzgets), and prints the CPU time
+ * of every pass to stderr. Returns 1 on a usage error or when the input
+ * cannot be opened. */
+int main(int argc, char *argv[])
+{
+	gzFile fp;
+	clock_t t;
+	if (argc == 1) {
+		fprintf(stderr, "Usage: kseq_bench <in.gz>\n");
+		return 1;
+	}
+	{
+		uint8_t *buf = malloc(BUF_SIZE);
+		fp = gzopen(argv[1], "r");
+		/* fail early: every later pass reopens the same file */
+		if (fp == NULL || buf == NULL) {
+			fprintf(stderr, "ERROR: failed to open '%s' or to allocate the read buffer\n", argv[1]);
+			if (fp != NULL) gzclose(fp);
+			free(buf);
+			return 1;
+		}
+		t = clock();
+		while (gzread(fp, buf, BUF_SIZE) > 0);
+		fprintf(stderr, "[gzread] %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+		gzclose(fp);
+		free(buf);
+	}
+	{
+		kstream_t *ks;
+		fp = gzopen(argv[1], "r");
+		ks = ks_init(fp);
+		t = clock();
+		while (ks_getc(ks) >= 0);
+		fprintf(stderr, "[ks_getc] %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+		ks_destroy(ks);
+		gzclose(fp);
+	}
+	{
+		kstream_t *ks;
+		kstring_t *s;
+		int dret;
+		s = calloc(1, sizeof(kstring_t));
+		fp = gzopen(argv[1], "r");
+		ks = ks_init(fp);
+		t = clock();
+		while (ks_getuntil(ks, '\n', s, &dret) >= 0);
+		fprintf(stderr, "[ks_getuntil] %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+		ks_destroy(ks);
+		gzclose(fp);
+		free(s->s); free(s);
+	}
+	if (argc == 2) {
+		fp = gzopen(argv[1], "r");
+		t = clock();
+		while (gzgetc(fp) >= 0);
+		fprintf(stderr, "[gzgetc] %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+		gzclose(fp);
+	}
+	if (argc == 2) {
+		char *buf = malloc(BUF_SIZE);
+		fp = gzopen(argv[1], "r");
+		t = clock();
+		/* gzgets() returns a pointer (null at end of file or on error), so
+		 * test it against NULL instead of ordering it against an integer */
+		while (gzgets(fp, buf, BUF_SIZE) != NULL);
+		fprintf(stderr, "[gzgets] %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+		gzclose(fp);
+		free(buf);
+	}
+	return 0;
+}
diff --git a/web/server/h2o/libh2o/deps/klib/test/kseq_bench2.c b/web/server/h2o/libh2o/deps/klib/test/kseq_bench2.c
new file mode 100644
index 000000000..b4154583b
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/test/kseq_bench2.c
@@ -0,0 +1,43 @@
+#include <stdio.h>
+#include <time.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "kseq.h"
+KSTREAM_INIT(int, read, 4096)
+
+#define BUF_SIZE 65536
+
+/* Usage: kseq_bench2 <in.txt>
+ * Reads the file line by line twice, once with fgets() and once with
+ * ks_getuntil() on a raw file descriptor, and prints the CPU time of each
+ * pass to stderr. Returns 1 on a usage error or when the input cannot be
+ * opened. */
+int main(int argc, char *argv[])
+{
+	clock_t t;
+	if (argc == 1) {
+		fprintf(stderr, "Usage: %s <in.txt>\n", argv[0]);
+		return 1;
+	}
+	{
+		FILE *fp;
+		char *s;
+		t = clock();
+		s = malloc(BUF_SIZE);
+		fp = fopen(argv[1], "r");
+		if (fp == NULL || s == NULL) { /* fgets() must not see a null stream or buffer */
+			fprintf(stderr, "ERROR: failed to open '%s' or to allocate the line buffer\n", argv[1]);
+			if (fp != NULL) fclose(fp);
+			free(s);
+			return 1;
+		}
+		while (fgets(s, BUF_SIZE, fp));
+		fclose(fp);
+		free(s); /* the line buffer is no longer needed */
+		fprintf(stderr, "[fgets] %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+	}
+	{
+		int fd, dret;
+		kstream_t *ks;
+		kstring_t s;
+		t = clock();
+		s.l = s.m = 0; s.s = 0;
+		fd = open(argv[1], O_RDONLY);
+		if (fd < 0) {
+			fprintf(stderr, "ERROR: failed to open '%s'\n", argv[1]);
+			return 1;
+		}
+		ks = ks_init(fd);
+		while (ks_getuntil(ks, '\n', &s, &dret) >= 0);
+		free(s.s);
+		ks_destroy(ks);
+		close(fd);
+		fprintf(stderr, "[kstream] %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+	}
+	return 0;
+}
diff --git a/web/server/h2o/libh2o/deps/klib/test/kseq_test.c b/web/server/h2o/libh2o/deps/klib/test/kseq_test.c
new file mode 100644
index 000000000..0304dea35
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/test/kseq_test.c
@@ -0,0 +1,27 @@
+#include <zlib.h>
+#include <stdio.h>
+#include "kseq.h"
+KSEQ_INIT(gzFile, gzread)
+
+/* Usage: kseq_test <in.fasta>
+ * Parses every record of the (optionally gzip-compressed) input and prints
+ * its name, comment, sequence and quality, followed by the final return value
+ * of kseq_read(). Returns 1 on a usage error or when the input cannot be
+ * opened. */
+int main(int argc, char *argv[])
+{
+	gzFile fp;
+	kseq_t *seq;
+	int l;
+	if (argc == 1) {
+		fprintf(stderr, "Usage: %s <in.fasta>\n", argv[0]);
+		return 1;
+	}
+	fp = gzopen(argv[1], "r");
+	if (fp == NULL) { /* do not hand a null stream to the parser */
+		fprintf(stderr, "ERROR: failed to open '%s'\n", argv[1]);
+		return 1;
+	}
+	seq = kseq_init(fp);
+	while ((l = kseq_read(seq)) >= 0) {
+		printf("name: %s\n", seq->name.s);
+		/* comment and quality are optional; print them only when non-empty */
+		if (seq->comment.l) printf("comment: %s\n", seq->comment.s);
+		printf("seq: %s\n", seq->seq.s);
+		if (seq->qual.l) printf("qual: %s\n", seq->qual.s);
+	}
+	printf("return value: %d\n", l);
+	kseq_destroy(seq);
+	gzclose(fp);
+	return 0;
+}
diff --git a/web/server/h2o/libh2o/deps/klib/test/kseq_test.dat b/web/server/h2o/libh2o/deps/klib/test/kseq_test.dat
new file mode 100644
index 000000000..b774ae289
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/test/kseq_test.dat
@@ -0,0 +1,12 @@
+>1
+acgtacgtacgtagc
+>2 test
+acgatcgatc
+@3 test2
+cgctagcatagc
+cgatatgactta
++
+78wo82usd980
+d88fau
+
+238ud8
diff --git a/web/server/h2o/libh2o/deps/klib/test/ksort_test.c b/web/server/h2o/libh2o/deps/klib/test/ksort_test.c
new file mode 100644
index 000000000..92c7d3d16
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/test/ksort_test.c
@@ -0,0 +1,104 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <time.h>
+#include "ksort.h"
+
+KSORT_INIT_GENERIC(int)
+
+int main(int argc, char *argv[])
+{ /* Times ks_ksmall and the ksort.h sorts on N pseudo-random ints (N = argv[1], default 10000000) and checks that each sorted result is ascending. */
+	int i, x, N = 10000000, *array;
+	clock_t t1, t2;
+	if (argc > 1) N = atoi(argv[1]);
+	if (N <= 10500) { fprintf(stderr, "N must be greater than 10500\n"); return 1; } /* rank/index 10500 is selected and printed below */
+	array = (int*)malloc(sizeof(int) * N);
+	if (array == 0) { fprintf(stderr, "Failed to allocate %d integers\n", N); return 1; }
+	srand48(11);
+	for (i = 0; i < N; ++i) array[i] = (int)lrand48();
+	t1 = clock();
+	x = ks_ksmall(int, N, array, 10500);
+	t2 = clock();
+	fprintf(stderr, "ksmall [%d]: %.3lf\n", x, (double)(t2-t1)/CLOCKS_PER_SEC);
+
+	srand48(11); /* same seed before every run: each algorithm sees identical input */
+	for (i = 0; i < N; ++i) array[i] = (int)lrand48();
+	t1 = clock();
+	ks_introsort(int, N, array);
+	t2 = clock();
+	fprintf(stderr, "introsort [%d]: %.3lf\n", array[10500], (double)(t2-t1)/CLOCKS_PER_SEC);
+	for (i = 0; i < N-1; ++i) {
+		if (array[i] > array[i+1]) {
+			fprintf(stderr, "Bug in introsort!\n");
+			exit(1);
+		}
+	}
+
+#ifndef _ALIGNED_ONLY
+	{ // test introsort on a deliberately misaligned int array (one byte past the malloc'ed address)
+		unsigned char *a = malloc(N * sizeof(int) + 1);
+		if (a == 0) { fprintf(stderr, "Failed to allocate %d integers\n", N); return 1; }
+		int *b = (int*)(a + 1);
+		srand48(11);
+		for (i = 0; i < N; ++i) b[i] = (int)lrand48();
+		t1 = clock();
+		ks_introsort(int, N, b);
+		t2 = clock();
+		fprintf(stderr, "introsort [%d]: %.3lf (unaligned: 0x%lx) \n", b[10500], (double)(t2-t1)/CLOCKS_PER_SEC, (size_t)b);
+		free(a); // this scratch buffer used to be leaked
+	}
+#endif
+
+	t1 = clock();
+	ks_introsort(int, N, array);
+	t2 = clock();
+	fprintf(stderr, "introsort (sorted): %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
+
+	srand48(11);
+	for (i = 0; i < N; ++i) array[i] = (int)lrand48();
+	t1 = clock();
+	ks_combsort(int, N, array);
+	t2 = clock();
+	fprintf(stderr, "combsort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
+	for (i = 0; i < N-1; ++i) {
+		if (array[i] > array[i+1]) {
+			fprintf(stderr, "Bug in combsort!\n");
+			exit(1);
+		}
+	}
+
+	srand48(11);
+	for (i = 0; i < N; ++i) array[i] = (int)lrand48();
+	t1 = clock();
+	ks_mergesort(int, N, array, 0);
+	t2 = clock();
+	fprintf(stderr, "mergesort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
+	for (i = 0; i < N-1; ++i) {
+		if (array[i] > array[i+1]) {
+			fprintf(stderr, "Bug in mergesort!\n");
+			exit(1);
+		}
+	}
+
+	t1 = clock();
+	ks_mergesort(int, N, array, 0);
+	t2 = clock();
+	fprintf(stderr, "mergesort (sorted): %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
+
+	srand48(11);
+	for (i = 0; i < N; ++i) array[i] = (int)lrand48();
+	t1 = clock();
+	ks_heapmake(int, N, array);
+	ks_heapsort(int, N, array);
+	t2 = clock();
+	fprintf(stderr, "heapsort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
+	for (i = 0; i < N-1; ++i) {
+		if (array[i] > array[i+1]) {
+			fprintf(stderr, "Bug in heapsort!\n");
+			exit(1);
+		}
+	}
+
+	free(array);
+	return 0;
+}
diff --git a/web/server/h2o/libh2o/deps/klib/test/ksort_test.cc b/web/server/h2o/libh2o/deps/klib/test/ksort_test.cc
new file mode 100644
index 000000000..8950d8064
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/test/ksort_test.cc
@@ -0,0 +1,997 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <time.h>
+#include <algorithm>
+
+#include "ksort.h"
+KSORT_INIT_GENERIC(int)
+
+using namespace std;
+
+/**********************************
+ * BEGIN OF PAUL'S IMPLEMENTATION *
+ **********************************/
+
+/* Attractive Chaos: I have added inline where necessary. */
+
+/*
+Copyright (c) 2004 Paul Hsieh
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    Redistributions of source code must retain the above copyright notice,
+    this list of conditions and the following disclaimer.
+
+    Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+    Neither the name of sorttest nor the names of its contributors may be
+    used to endorse or promote products derived from this software without
+    specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+
+Recommended flags:
+------------------
+
+Intel C/C++:
+icl /O2 /G6 /Qaxi /Qxi /Qip sorttest.c
+
+WATCOM C/C++:
+wcl386 /otexan /6r sorttest.c
+
+GCC:
+gcc -O3 -mcpu=athlon-xp -march=athlon-xp sorttest.c
+
+MSVC:
+cl /O2 /Ot /Og /G6 sorttest.c
+
+*/
+
+static inline void sort2 (int * numbers) {
+    /* Put the pair numbers[0], numbers[1] into ascending order, in place. */
+    if (numbers[1] < numbers[0]) {
+        const int larger = numbers[0];
+        numbers[0] = numbers[1];
+        numbers[1] = larger;
+    }
+}
+
+static inline void sort3 (int * numbers) { /* Sorts the three ints numbers[0..2] into ascending order, in place. */
+int tmp;
+    /* Branches that cannot finish on their own fall through to the two shared stores at the bottom. */
+    if (numbers[0] <= numbers[1]) {
+        if (numbers[1] <= numbers[2]) return; /* already ascending */
+        if (numbers[2] <= numbers[0]) { /* ascending order is [2],[0],[1]: rotate all three */
+            tmp = numbers[0];
+            numbers[0] = numbers[2];
+            numbers[2] = numbers[1];
+            numbers[1] = tmp;
+            return;
+        }
+        tmp = numbers[1]; /* ascending order is [0],[2],[1]: the shared tail swaps slots 1 and 2 */
+    } else {
+        tmp = numbers[0];
+        if (numbers[0] <= numbers[2]) { /* ascending order is [1],[0],[2]: swap the first two */
+            numbers[0] = numbers[1];
+            numbers[1] = tmp;
+            return;
+        }
+        if (numbers[2] <= numbers[1]) { /* ascending order is [2],[1],[0]: swap the outer two */
+            numbers[0] = numbers[2];
+            numbers[2] = tmp;
+            return;
+        }
+        numbers[0] = numbers[1]; /* ascending order is [1],[2],[0]: the shared tail completes the rotation */
+    }
+    numbers[1] = numbers[2];
+    numbers[2] = tmp;
+}
+
+static inline void sort4 (int * num) { /* Rearranges num[0..3] in place through a hand-unrolled comparison tree (no loops); callers in this file use it to sort four ints ascending. */
+int tmp;
+  if (num[0] < num[1]) { /* first half of the tree: num[0] < num[1] */
+    if (num[1] < num[2]) { /* num[0] < num[1] < num[2]: only num[3] remains to be placed */
+      if (num[1] < num[3]) {
+        if (num[2] >= num[3]) {
+          tmp = num[2];
+          num[2] = num[3];
+          num[3] = tmp;
+        }
+      } else {
+        tmp = num[1];
+        if (num[0] < num[3]) {
+          num[1] = num[3];
+        } else {
+          num[1] = num[0];
+          num[0] = num[3];
+        }
+        num[3] = num[2];
+        num[2] = tmp;
+      }
+    } else {
+      if (num[0] < num[2]) { /* num[0] < num[2] <= num[1] */
+        if (num[2] < num[3]) {
+          if (num[1] < num[3]) {
+            tmp = num[1];
+          } else {
+            tmp = num[3];
+            num[3] = num[1];
+          }
+          num[1] = num[2];
+          num[2] = tmp;
+        } else {
+          if (num[0] < num[3]) {
+            tmp = num[3];
+          } else {
+            tmp = num[0];
+            num[0] = num[3];
+          }
+          num[3] = num[1];
+          num[1] = tmp;
+        }
+      } else { /* num[2] <= num[0] < num[1] */
+        if (num[0] < num[3]) {
+          tmp = num[0];
+          num[0] = num[2];
+          if (num[1] < num[3]) {
+            num[2] = num[1];
+          } else {
+            num[2] = num[3];
+            num[3] = num[1];
+          }
+          num[1] = tmp;
+        } else {
+          if (num[2] < num[3]) {
+            tmp = num[0];
+            num[0] = num[2];
+            num[2] = tmp;
+            tmp = num[1];
+            num[1] = num[3];
+          } else {
+            tmp = num[1];
+            num[1] = num[2];
+            num[2] = num[0];
+            num[0] = num[3];
+          }
+          num[3] = tmp;
+        }
+      }
+    }
+  } else { /* second half of the tree: num[1] <= num[0] */
+    tmp = num[0]; /* the old num[0] is kept in tmp while the sub-branches compare against it */
+    if (tmp < num[2]) {
+      if (tmp < num[3]) {
+        num[0] = num[1];
+        num[1] = tmp;
+        if (num[2] >= num[3]) {
+          tmp = num[2];
+          num[2] = num[3];
+          num[3] = tmp;
+        }
+      } else {
+        if (num[1] < num[3]) {
+          num[0] = num[1];
+          num[1] = num[3];
+        } else {
+          num[0] = num[3];
+        }
+        num[3] = num[2];
+        num[2] = tmp;
+      }
+    } else {
+      if (num[1] < num[2]) {
+        if (num[2] < num[3]) {
+          num[0] = num[1];
+          num[1] = num[2];
+          if (tmp < num[3]) {
+            num[2] = tmp;
+          } else {
+            num[2] = num[3];
+            num[3] = tmp;
+          }
+        } else {
+          if (num[1] < num[3]) {
+            num[0] = num[1];
+            num[1] = num[3];
+          } else {
+            num[0] = num[3];
+          }
+          num[3] = tmp;
+        }
+      } else {
+        if (num[1] < num[3]) {
+          num[0] = num[2];
+          if (tmp < num[3]) {
+            num[2] = tmp;
+          } else {
+            num[2] = num[3];
+            num[3] = tmp;
+          }
+        } else {
+          if (num[2] < num[3]) {
+            num[0] = num[2];
+            num[2] = num[1];
+            num[1] = num[3];
+            num[3] = tmp;
+          } else {
+            num[0] = num[3];
+            num[3] = tmp;
+            tmp = num[1];
+            num[1] = num[2];
+            num[2] = tmp;
+          }
+        }
+      }
+    }
+  }
+}
+
+static inline void sortAlt2 (int * numbers, int * altNumbers) { /* Writes numbers[0..1] into altNumbers[0..1] in ascending order. */
+    if (numbers[1] < numbers[0]) {
+        altNumbers[0] = numbers[1];   /* out of order: emit the pair swapped */
+        altNumbers[1] = numbers[0];
+        return;
+    }
+    altNumbers[0] = numbers[0];
+    altNumbers[1] = numbers[1];
+}
+
+static inline void sortAlt3 (int * numbers, int * altNumbers) {
+    /* Copy numbers[0..2] into altNumbers[0..2] in ascending order.
+       Every comparison happens before the first store; the comparisons
+       merely decide which source index holds the smallest, the middle
+       and the largest value. */
+    int lo, mid, hi;
+
+    if (numbers[0] <= numbers[1]) {
+        if (numbers[1] <= numbers[2]) {
+            lo = 0; mid = 1; hi = 2;
+        } else if (numbers[2] <= numbers[0]) {
+            lo = 2; mid = 0; hi = 1;
+        } else {
+            lo = 0; mid = 2; hi = 1;
+        }
+    } else {
+        if (numbers[0] <= numbers[2]) {
+            lo = 1; mid = 0; hi = 2;
+        } else if (numbers[2] <= numbers[1]) {
+            lo = 2; mid = 1; hi = 0;
+        } else {
+            lo = 1; mid = 2; hi = 0;
+        }
+    }
+
+    /* Stores go to slot 0, then 1, then 2. */
+    altNumbers[0] = numbers[lo];
+    altNumbers[1] = numbers[mid];
+    altNumbers[2] = numbers[hi];
+}
+
+/*
+ *  Insert Sort
+ */
+
+inline void insertSort (int numbers[], int qty) {
+    /* Despite the name this is a selection sort: the minimum of the unsorted
+       suffix is swapped to the front until four elements remain, and those
+       go to sort4. Inputs of at most four elements use sort2/3/4 directly. */
+    int head, tail;
+
+    switch (qty) {
+        case 4: sort4 (numbers); return;
+        case 3: sort3 (numbers); return;
+        case 2: sort2 (numbers); return;
+        default: if (qty <= 4) return;   /* one element or fewer: nothing to do */
+    }
+
+    tail = qty - 4;
+    for (head = 0; head < tail; head++) {
+        int smallest = head, probe;
+        for (probe = head + 1; probe < qty; probe++) {
+            if (numbers[probe] < numbers[smallest]) smallest = probe;
+        }
+        if (smallest != head) {
+            const int moved = numbers[smallest];
+            numbers[smallest] = numbers[head];
+            numbers[head] = moved;
+        }
+    }
+    sort4 (numbers + tail);
+}
+
+/*
+ *  Heap Sort
+ */
+
+/* Assure the heap property for entries from top to last */
+static void siftDown (int numbers[], int top, int last) {
+    /* Sift the value at numbers[top] down a max-heap in which node k has
+       the children 2k and 2k+1; no index greater than last is touched. */
+    const int held = numbers[top];
+    int child;
+
+    for (child = top + top; child <= last; child += child) {
+        /* Prefer the right sibling when it exists and holds the larger value. */
+        if (child != last && numbers[child] < numbers[child + 1]) child++;
+
+        /* Stop once the held value is at least as large as the bigger child. */
+        if (held >= numbers[child]) break;
+        numbers[top] = numbers[child];
+        top = child;
+    }
+    numbers[top] = held;
+}
+
+/* Peel off the top siftDown operation since its parameters are trivial to
+   fill in directly (and this saves us some moves.) */
+static void siftDown0 (int numbers[], int last) { /* One extraction step of heapSort: stores the larger of numbers[0] and numbers[1] at numbers[last]. */
+int tmp;
+    /* heapSort calls this with numbers[1..last] sifted into heap order, so numbers[1] is the heap root and numbers[0] sits outside the heap. */
+    if (numbers[0] < numbers[1]) {
+        tmp = numbers[1]; /* the root is the larger value */
+        numbers[1] = numbers[0]; /* numbers[0] takes the root's place ... */
+        siftDown (numbers, 1, last); /* ... and is sifted down from there */
+    } else {
+        tmp = numbers[0]; /* numbers[0] is already the larger value */
+    }
+    numbers[0] = numbers[last]; /* the element at index last moves out to index 0 */
+    numbers[last] = tmp;
+}
+
+void heapSort (int numbers[], int qty) {
+    /* Heap sort of numbers[0..qty-1], ascending. The heap proper occupies
+       indices 1..qty-1; index 0 is folded in by siftDown0. */
+    int node, last;
+    if (qty == 4) { sort4 (numbers); return; }
+    if (qty == 3) { sort3 (numbers); return; }
+    if (qty == 2) { sort2 (numbers); return; }
+    if (qty <  2) return;
+
+    last = qty - 1;
+    /* Build phase: sift nodes qty/2 down to 1, highest index first. */
+    for (node = qty / 2; node > 0; node--) siftDown (numbers, node, last);
+    /* Extraction phase: each call stores a value at index node. */
+    for (node = last; node > 0; node--) siftDown0 (numbers, node);
+}
+
+/*
+ *  Quick Sort
+ */
+
+static int medianOf3 (int * numbers, int i, int j) {
+    /* Pick the median of numbers[0], numbers[i] and numbers[j], swap it into
+       numbers[0] and return it. */
+    const int head = numbers[0];
+    int pick;                       /* index that currently holds the median */
+
+    if (head <= numbers[i]) {
+        if (numbers[j] <= head) return head;             /* j 0 i */
+        pick = (numbers[i] <= numbers[j]) ? i : j;       /* 0 i j  or  0 j i */
+    } else {
+        if (head <= numbers[j]) return head;             /* i 0 j */
+        pick = (numbers[j] <= numbers[i]) ? i : j;       /* j i 0  or  i j 0 */
+    }
+    numbers[0] = numbers[pick];
+    numbers[pick] = head;
+    return numbers[0];
+}
+
+static void quickSortRecurse (int * numbers, int left, int right) { /* Quicksort of the inclusive index range numbers[left..right]. */
+int pivot, lTmp, rTmp;
+    /* Re-entry point: the larger partition is handled by jumping back here instead of recursing. */
+    qsrStart:;
+    /* Short ranges are handed to the small sorts: up to 9 elements under GCC, up to 4 otherwise. */
+#if defined(__GNUC__)
+    if (right <= left + 8) {
+        insertSort (numbers + left, right - left + 1);
+        return;
+    }
+#else
+    if (right <= left + 3) {
+        if (right == left + 1) {
+            sort2 (numbers + left);
+        } else if (right == left + 2) {
+            sort3 (numbers + left);
+        } else if (right == left + 3) {
+            sort4 (numbers + left);
+        }
+        return;
+    }
+#endif
+
+    lTmp = left; /* remember the original bounds; left and right are consumed by the partition scan */
+    rTmp = right;
+    /* medianOf3 swaps the median of the first, middle and next-to-last element into numbers[left], which becomes the "hole". */
+    pivot = medianOf3 (numbers + left, (right-left) >> 1, right-1-left);
+    /* Enter the first scan at its condition so that numbers[right] itself is tested before right is decremented. */
+    goto QStart;
+    while (1) {
+        do { /* scan down from the right for an element that is not greater than the pivot */
+            right--;
+            if (left >= right) goto QEnd;
+            QStart:;
+        } while (numbers[right] > pivot);
+        numbers[left] = numbers[right]; /* move it into the hole; the hole is now at right */
+        do { /* scan up from the left for an element that is not smaller than the pivot */
+            left++;
+            if (left >= right) {
+                left = right;
+                goto QEnd;
+            }
+        } while (numbers[ left] < pivot);
+        numbers[right] = numbers[left]; /* move it into the hole; the hole is now at left */
+    }
+    QEnd:;
+    numbers[left] = pivot; /* the pivot drops into the final hole */
+
+    /* Only recurse the smaller partition */
+
+    if (left-1 - lTmp <= rTmp - left - 1) {
+        if (lTmp < left) quickSortRecurse (numbers,   lTmp, left-1);
+
+        /* Set up for larger partition */
+        left++;
+        right = rTmp;
+    } else {
+        if (rTmp > left) quickSortRecurse (numbers, left+1,   rTmp);
+
+        /* Set up for larger partition */
+        right = left - 1;
+        left = lTmp;
+    }
+
+    /* Rerun with larger partition (recursion not required.) */
+    goto qsrStart;
+}
+
+void quickSort (int numbers[], int qty) {
+    /* Sorts numbers[0..qty-1]; fewer than two elements need no work. */
+    if (qty >= 2) quickSortRecurse (numbers, 0, qty - 1);
+}
+
+/*
+ *  Merge Sort
+ */
+
+static void mergesortInPlace (int * numbers, int * altNumbers, int qty);
+
+/* Perform mergesort, but store results in altNumbers */
+
+static void mergesortExchange (int * numbers, int * altNumbers, int qty) {
+    /* Sort numbers[0..qty-1] and deliver the result in altNumbers[0..qty-1].
+       The two halves are first sorted inside numbers (with altNumbers as
+       their scratch space) and then merged across into altNumbers. */
+    int split, left, right, out;
+
+    /* Two and three elements are written straight into altNumbers. */
+    switch (qty) {
+        case 2: sortAlt2 (numbers, altNumbers); return;
+        case 3: sortAlt3 (numbers, altNumbers); return;
+        default: break;
+    }
+
+    /* The left half receives the extra element when qty is odd. */
+    split = (qty + 1)/2;
+    mergesortInPlace (numbers, altNumbers, split);
+    mergesortInPlace (numbers + split, altNumbers, qty - split);
+
+    left = 0;
+    right = split;
+    for (out = 0; out < qty; out++) {
+        /* Take from the left run when the right run is used up, or when the
+           left run still has elements and its head is strictly smaller. */
+        const bool takeLeft =
+            right >= qty || (left < split && numbers[left] < numbers[right]);
+
+        altNumbers[out] = takeLeft ? numbers[left++] : numbers[right++];
+    }
+}
+
+/* Perform mergesort and store results in numbers */
+
+static void mergesortInPlace (int * numbers, int * altNumbers, int qty) { /* Sorts numbers[0..qty-1]; altNumbers[0..qty-1] is used as scratch space. */
+int half, i0, i1, i;
+    /* The #if 0 branch is compiled out: runs of up to 12 elements go to insertSort instead. */
+#if 0
+    if (qty == 2) {
+        sort2 (numbers);
+        return;
+    }
+    if (qty == 3) {
+        sort3 (numbers);
+        return;
+    }
+    if (qty == 4) {
+        sort4 (numbers);
+        return;
+    }
+#else
+    if (qty <= 12) {
+        insertSort (numbers, qty);
+        return;
+    }
+#endif
+
+    half = (qty + 1)/2; /* the left half gets the extra element when qty is odd */
+    /* Each half is sorted out of numbers into the matching half of altNumbers ... */
+    mergesortExchange (numbers, altNumbers, half);
+    mergesortExchange (numbers + half, altNumbers + half, qty - half);
+    /* ... and the two sorted halves are merged back into numbers. */
+    i0 = 0; i1 = half;
+
+    for (i=0; i < qty; i++) {
+        if (i1 >= qty || (i0 < half && altNumbers[i0] < altNumbers[i1])) { /* right half used up, or left head strictly smaller */
+            numbers[i] = altNumbers[i0];
+            i0++;
+        } else {
+            numbers[i] = altNumbers[i1];
+            i1++;
+        }
+    }
+}
+
+#include <stdlib.h>
+
+void mergeSort (int numbers[], int qty) {
+    /* Sorts numbers[0..qty-1] ascending. Needs a scratch buffer of qty ints;
+       if that cannot be allocated, the in-place heapSort is used instead. */
+    int * tmpArray;
+
+    if (qty <= 12) { insertSort (numbers, qty); return; }   /* small input: no buffer needed */
+
+    tmpArray = (int *) malloc (qty * sizeof (int));
+    if (tmpArray == NULL) { heapSort (numbers, qty); return; }   /* a failed malloc used to be dereferenced */
+    mergesortInPlace (numbers, tmpArray, qty);
+    free (tmpArray);
+}
+
+/********************************
+ * END OF PAUL'S IMPLEMENTATION *
+ ********************************/
+
+/*************************************************
+ *** Implementation 1: faster on sorted arrays ***
+ *************************************************/
+
+#define rstype_t unsigned /* element type handled by rs_sort() and rs_insertsort() */
+#define rskey(x) (x) /* maps an element to its sort key; here the element itself */
+
+#define RS_MIN_SIZE 64 /* rs_sort() recurses only into buckets holding more than this many elements */
+
+typedef struct { /* one radix bucket of rs_sort(), described by two element pointers */
+	rstype_t *b, *e; /* rs_sort() uses b as the bucket's begin / next unprocessed slot and e as its end (exclusive) */
+} rsbucket_t;
+
+void rs_sort(rstype_t *beg, rstype_t *end, int n_bits, int s) /* In-place radix sort of [beg, end) on the n_bits-wide digit starting at bit s, recursing on lower digits. */
+{
+	rstype_t *i;
+	int size = 1<<n_bits, m = size - 1; /* number of buckets and the digit mask */
+	rsbucket_t *k, b[size], *be = b + size; /* b[size] is a variable-length array (compiler extension in C++) */
+	/* Pass 1: count digits by advancing each bucket's e pointer from beg, then turn the counts into [b, e) ranges. */
+	for (k = b; k != be; ++k) k->b = k->e = beg;
+	for (i = beg; i != end; ++i) ++b[rskey(*i)>>s&m].e;
+	for (k = b + 1; k != be; ++k)
+		k->e += (k-1)->e - beg, k->b = (k-1)->e; /* prefix sum: each bucket starts where the previous one ends */
+	for (k = b; k != be;) { /* Pass 2: permute elements into their buckets, following displacement cycles */
+		if (k->b != k->e) {
+			rsbucket_t *l;
+			if ((l = b + (rskey(*k->b)>>s&m)) != k) { /* element at k->b belongs to another bucket l */
+				rstype_t tmp = *k->b, swap;
+				do { /* drop tmp into its bucket and pick up the element it displaces */
+					swap = tmp; tmp = *l->b; *l->b++ = swap;
+					l = b + (rskey(tmp)>>s&m);
+				} while (l != k); /* until the element in hand belongs to bucket k */
+				*k->b++ = tmp;
+			} else ++k->b; /* already in its own bucket */
+		} else ++k; /* bucket k is complete */
+	}
+	for (b->b = beg, k = b + 1; k != be; ++k) k->b = (k-1)->e; /* the b pointers were consumed above: restore the bucket starts */
+	if (s) { /* lower digits remain only when the current shift is non-zero */
+		s = s > n_bits? s - n_bits : 0;
+		for (k = b; k != be; ++k)
+			if (k->e - k->b > RS_MIN_SIZE) rs_sort(k->b, k->e, n_bits, s);
+			else if (k->e - k->b > 1) /* small bucket: insertion sort on the whole key */
+				for (i = k->b + 1; i < k->e; ++i)
+					if (rskey(*i) < rskey(*(i - 1))) {
+						rstype_t *j, tmp = *i;
+						for (j = i; j > k->b && rskey(tmp) < rskey(*(j-1)); --j)
+							*j = *(j - 1);
+						*j = tmp;
+					}
+	}
+}
+
+/*************************************************
+ *** Implementation 2: faster on random arrays ***
+ *************************************************/
+
+static inline void rs_insertsort(rstype_t *s, rstype_t *t)
+{	/* Insertion sort of the half-open range [s, t), ascending by rskey(). */
+	rstype_t *cur;
+	for (cur = s + 1; cur < t; ++cur) {
+		rstype_t key = *cur, *hole = cur;
+		if (!(rskey(key) < rskey(*(cur - 1)))) continue;	/* not smaller than its left neighbour: already in place */
+		/* Shift larger elements one slot to the right until the gap reaches key's position. */
+		for (; hole > s && rskey(key) < rskey(*(hole - 1)); --hole)
+			*hole = *(hole - 1);
+		*hole = key;
+	}
+}
+/*
+void rs_sort2(rstype_t *beg, rstype_t *end, int n_bits, int s)
+{
+	int j, size = 1<<n_bits, m = size - 1;
+	unsigned long c[size];
+	rstype_t *i, *b[size], *e[size];
+
+	for (j = 0; j < size; ++j) c[j] = 0;
+	for (i = beg; i != end; ++i) ++c[rskey(*i)>>s&m];
+	b[0] = e[0] = beg;
+	for (j = 1; j != size; ++j) b[j] = e[j] = b[j - 1] + c[j - 1];
+	for (i = beg, j = 0; i != end;) {
+		rstype_t tmp = *i, swap;
+		int x;
+		for (;;) {
+			x = rskey(tmp)>>s&m;
+			if (e[x] == i) break;
+			swap = tmp; tmp = *e[x]; *e[x]++ = swap;
+		}
+		*i++ = tmp;
+		++e[x];
+		while (j != size && i >= b[j]) ++j;
+		while (j != size && e[j-1] == b[j]) ++j;
+		if (i < e[j-1]) i = e[j-1];
+	}
+	if (s) {
+		s = s > n_bits? s - n_bits : 0;
+		for (j = 0; j < size; ++j) {
+			if (c[j] >= RS_MIN_SIZE) rs_sort2(b[j], e[j], n_bits, s);
+			else if (c[j] >= 2) rs_insertsort(b[j], e[j]);
+		}
+	}
+}
+*/
+void radix_sort(unsigned *array, int offset, int end, int shift) { /* In-place radix sort of array[offset..end-1] on the byte at bit position shift, recursing on lower bytes. */
+    int x, y, value, temp;
+    int last[256] = { 0 }, pointer[256]; /* last[]: per-byte counts, later end offsets; pointer[]: next slot to fill in each bucket */
+    /* Count how often each byte value occurs. */
+    for (x=offset; x<end; ++x) {
+        ++last[(array[x] >> shift) & 0xFF];
+    }
+    /* Turn the counts into absolute start (pointer) and end (last) positions. */
+    last[0] += offset;
+    pointer[0] = offset;
+    for (x=1; x<256; ++x) {
+        pointer[x] = last[x-1];
+        last[x] += last[x-1];
+    }
+    /* Permute: finish bucket x by moving misplaced elements to their buckets along displacement cycles. */
+    for (x=0; x<256; ++x) {
+        while (pointer[x] != last[x]) {
+            value = array[pointer[x]];
+            y = (value >> shift) & 0xFF;
+            while (x != y) { /* value belongs to bucket y: store it there and continue with the element it displaced */
+                temp = array[pointer[y]];
+                array[pointer[y]++] = value;
+                value = temp;
+                y = (value >> shift) & 0xFF;
+            }
+            array[pointer[x]++] = value;
+        }
+    }
+    /* After the loop above pointer[x] is the end of bucket x, so bucket sizes are differences of neighbours. */
+    if (shift > 0) {
+        shift -= 8;
+        for (x=0; x<256; ++x) {
+            temp = x > 0 ? pointer[x] - pointer[x-1] : pointer[0] - offset; /* size of bucket x */
+            if (temp > 64) {
+                radix_sort(array, pointer[x] - temp, pointer[x], shift);
+            } else if (temp > 1) rs_insertsort(array + pointer[x] - temp, array + pointer[x]); /* small bucket: insertion sort */
+        }
+    }
+}
+/*************************
+ *** END OF RADIX SORT ***
+ *************************/
+
+template< class _Type, unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold >
+inline void _RadixSort_Unsigned_PowerOf2Radix_1( _Type* a, long last, _Type bitMask, unsigned long shiftRightAmount )
+{	// In-place radix sort of a[ 0..last ] (last is an index, not a count) on the digit ( x & bitMask ) >> shiftRightAmount; each bin then recurses on the next lower digit
+	const unsigned long numberOfBins = PowerOfTwoRadix;
+	unsigned long count[ numberOfBins ];
+	for( unsigned long i = 0; i < numberOfBins; i++ )
+		count[ i ] = 0;
+	for ( long _current = 0; _current <= last; _current++ ) // Scan the array and count the number of times each value appears
+	{
+		unsigned long digit = (unsigned long)(( a[ _current ] & bitMask ) >> shiftRightAmount ); // extract the digit we are sorting based on
+		count[ digit ]++;
+	}
+	long startOfBin[ numberOfBins ], endOfBin[ numberOfBins ], nextBin;	// endOfBin[ d ] grows from startOfBin[ d ] as elements are placed into bin d
+	startOfBin[ 0 ] = endOfBin[ 0 ] = nextBin = 0;
+	for( unsigned long i = 1; i < numberOfBins; i++ )
+		startOfBin[ i ] = endOfBin[ i ] = startOfBin[ i - 1 ] + count[ i - 1 ];
+	for ( long _current = 0; _current <= last; )
+	{
+		unsigned long digit;
+		_Type tmp = a[ _current ];  // get the compiler to recognize that a register can be used for the loop instead of a[_current] memory location
+		while ( true ) {
+			digit = (unsigned long)(( tmp & bitMask ) >> shiftRightAmount );   // extract the digit we are sorting based on
+			if ( endOfBin[ digit ] == _current )
+				break;
+			_Type tmp2;
+			//_swap( tmp, a[ endOfBin[ digit ] ] );
+			tmp2 = a[endOfBin[digit]]; a[endOfBin[digit]] = tmp; tmp = tmp2;
+			endOfBin[ digit ]++;
+		}
+		a[ _current ] = tmp;
+		endOfBin[ digit ]++;   // leave the element at its location and grow the bin
+		_current++;  // advance the current pointer to the next element
+		while( nextBin < numberOfBins && _current >= startOfBin[ nextBin ] )	// bounds test first: startOfBin[ numberOfBins ] does not exist
+			nextBin++;
+		while( nextBin < numberOfBins && endOfBin[ nextBin - 1 ] == startOfBin[ nextBin ] )	// skip bins that are still empty; same bounds-first ordering
+			nextBin++;
+		if ( _current < endOfBin[ nextBin - 1 ] )	// everything below endOfBin is already placed: jump over it
+			_current = endOfBin[ nextBin - 1 ];
+	}
+	bitMask >>= Log2ofPowerOfTwoRadix;
+	if ( bitMask != 0 )   // end recursion when all the bits have been processed
+	{
+		if ( shiftRightAmount >= Log2ofPowerOfTwoRadix ) shiftRightAmount -= Log2ofPowerOfTwoRadix;
+		else shiftRightAmount  = 0;
+		for( unsigned long i = 0; i < numberOfBins; i++ )
+		{
+			long numberOfElements = endOfBin[ i ] - startOfBin[ i ];
+			if ( numberOfElements >= Threshold )  // endOfBin actually points to one beyond the bin
+				_RadixSort_Unsigned_PowerOf2Radix_1< _Type, PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >( &a[ startOfBin[ i ]], numberOfElements - 1, bitMask, shiftRightAmount );
+			else if ( numberOfElements >= 2 )	// bins below Threshold are finished with insertion sort
+				rs_insertsort(&a[ startOfBin[ i ]], &a[ endOfBin[ i ]]);
+		}
+	}
+}
+inline void RadixSortInPlace_HybridUnsigned_Radix256( unsigned* a, unsigned long a_size )
+{	// Sorts a[ 0..a_size-1 ]: insertion sort below 32 elements, otherwise the
+	// in-place radix sort, one byte (radix 256) per level starting at the top byte.
+	if ( a_size < 2 ) return;	// zero or one element: nothing to do
+	if ( a_size < 32 ) {
+		rs_insertsort( a, a + a_size );
+		return;
+	}
+	_RadixSort_Unsigned_PowerOf2Radix_1<unsigned, 256, 8, 32>( a, a_size - 1, 0xFF000000UL, 24UL );	// mask and shift select bits 24..31
+}
+
+struct intcmp_t {
+	inline int operator() (int a, int b) const {	// three-way comparison: -1 if a < b, 1 if a > b, 0 if equal
+		return (a > b) - (a < b);
+	}
+};
+
+int compare_int(int a, int b)
+{	// Returns -1 when a < b, 1 when a > b and 0 when both are equal.
+	return a == b ? 0 : (a < b ? -1 : 1);
+}
+int compare(const void *a, const void *b)
+{	// qsort() comparator for int: negative, zero or positive. Compares instead of subtracting, because x - y overflows for distant values.
+	return (*(const int*)a > *(const int*)b) - (*(const int*)a < *(const int*)b);
+}
+
+int main(int argc, char *argv[])
+{	// Benchmark driver: times each sort on N pseudo-random ints (N = argv[1], default 50000000) and exits with status 1 if a checked result is not ascending.
+	int i, N = 50000000;
+	int *array;
+	clock_t t1, t2;
+	if (argc == 1) fprintf(stderr, "Usage: %s [%d]\n", argv[0], N);
+	if (argc > 1) N = atoi(argv[1]);
+	array = (int*)malloc(sizeof(int) * N);
+	if (array == 0) { fprintf(stderr, "Failed to allocate %d integers\n", N); return 1; }	// the unused second buffer is gone; this one is now checked
+
+	srand48(11);	// same seed before every run: each algorithm sorts identical input
+	for (i = 0; i < N; ++i) array[i] = (int)lrand48();
+	t1 = clock();
+	rs_sort((unsigned*)array, (unsigned*)array + N, 8, 24);
+	t2 = clock();
+	fprintf(stderr, "radix sort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
+	for (i = 0; i < N-1; ++i) {
+		if (array[i] > array[i+1]) {
+			fprintf(stderr, "Bug in radix sort!\n");
+			exit(1);
+		}
+	}
+	t1 = clock();
+	rs_sort((unsigned*)array, (unsigned*)array + N, 8, 24);
+	t2 = clock();
+	fprintf(stderr, "radix sort (sorted): %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
+
+	srand48(11);
+	for (i = 0; i < N; ++i) array[i] = (int)lrand48();
+	t1 = clock();
+	RadixSortInPlace_HybridUnsigned_Radix256((unsigned*)array, N);
+//	radix_sort((unsigned*)array, 0, N, 24);
+	t2 = clock();
+	fprintf(stderr, "vd's radix sort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
+	for (i = 0; i < N-1; ++i) {
+		if (array[i] > array[i+1]) {
+			fprintf(stderr, "Bug in radix sort!\n");
+			exit(1);
+		}
+	}
+	t1 = clock();
+	RadixSortInPlace_HybridUnsigned_Radix256((unsigned*)array, N);
+//	radix_sort((unsigned*)array, 0, N, 24);
+	t2 = clock();
+	fprintf(stderr, "vd's radix sort (sorted): %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
+
+	srand48(11);
+	for (i = 0; i < N; ++i) array[i] = (int)lrand48();
+	t1 = clock();
+	sort(array, array+N);
+	t2 = clock();
+	fprintf(stderr, "STL introsort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
+	t1 = clock();
+	sort(array, array+N);
+	t2 = clock();
+	fprintf(stderr, "STL introsort (sorted): %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
+
+	srand48(11);
+	for (i = 0; i < N; ++i) array[i] = (int)lrand48();
+	t1 = clock();
+	stable_sort(array, array+N);
+	t2 = clock();
+	fprintf(stderr, "STL stablesort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
+	t1 = clock();
+	stable_sort(array, array+N);
+	t2 = clock();
+	fprintf(stderr, "STL stablesort (sorted): %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
+
+	srand48(11);
+	for (i = 0; i < N; ++i) array[i] = (int)lrand48();
+	t1 = clock();
+	make_heap(array, array+N);
+	sort_heap(array, array+N);
+	t2 = clock();
+	fprintf(stderr, "STL heapsort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
+	for (i = 0; i < N-1; ++i) {
+		if (array[i] > array[i+1]) {
+			fprintf(stderr, "Bug in heap_sort!\n");
+			exit(1);
+		}
+	}
+	t1 = clock();
+	make_heap(array, array+N);
+	sort_heap(array, array+N);
+	t2 = clock();
+	fprintf(stderr, "STL heapsort (sorted): %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
+
+	srand48(11);
+	for (i = 0; i < N; ++i) array[i] = (int)lrand48();
+	t1 = clock();
+	ks_combsort(int, N, array);
+	t2 = clock();
+	fprintf(stderr, "combsort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
+	for (i = 0; i < N-1; ++i) {
+		if (array[i] > array[i+1]) {
+			fprintf(stderr, "Bug in combsort!\n");
+			exit(1);
+		}
+	}
+
+	srand48(11);
+	for (i = 0; i < N; ++i) array[i] = (int)lrand48();
+	t1 = clock();
+	qsort(array, N, sizeof(int), compare);
+	t2 = clock();
+	fprintf(stderr, "libc qsort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
+
+	srand48(11);
+	for (i = 0; i < N; ++i) array[i] = (int)lrand48();
+	t1 = clock();
+	ks_introsort(int, N, array);
+	t2 = clock();
+	fprintf(stderr, "my introsort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
+	for (i = 0; i < N-1; ++i) {
+		if (array[i] > array[i+1]) {
+			fprintf(stderr, "Bug in intro_sort!\n");
+			exit(1);
+		}
+	}
+	t1 = clock();
+	ks_introsort(int, N, array);
+	t2 = clock();
+	fprintf(stderr, "introsort (sorted): %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
+
+	srand48(11);
+	for (i = 0; i < N; ++i) array[i] = (int)lrand48();
+	t1 = clock();
+	ks_mergesort(int, N, array, 0);
+	t2 = clock();
+	fprintf(stderr, "iterative mergesort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
+	for (i = 0; i < N-1; ++i) {
+		if (array[i] > array[i+1]) {
+			fprintf(stderr, "Bug in merge_sort!\n");
+			exit(1);
+		}
+	}
+	t1 = clock();
+	ks_mergesort(int, N, array, 0);
+	t2 = clock();
+	fprintf(stderr, "iterative mergesort (sorted): %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
+
+	srand48(11);
+	for (i = 0; i < N; ++i) array[i] = (int)lrand48();
+	t1 = clock();
+	ks_heapmake(int, N, array);
+	ks_heapsort(int, N, array);
+	t2 = clock();
+	fprintf(stderr, "my heapsort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
+	for (i = 0; i < N-1; ++i) {
+		if (array[i] > array[i+1]) {
+			fprintf(stderr, "Bug in heap_sort!\n");
+			exit(1);
+		}
+	}
+	t1 = clock();
+	ks_heapmake(int, N, array);
+	ks_heapsort(int, N, array);
+	t2 = clock();
+	fprintf(stderr, "heapsort (sorted): %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
+
+	srand48(11);
+	for (i = 0; i < N; ++i) array[i] = (int)lrand48();
+	t1 = clock();
+	heapSort(array, N);
+	t2 = clock();
+	fprintf(stderr, "Paul's heapsort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
+	for (i = 0; i < N-1; ++i) {
+		if (array[i] > array[i+1]) {
+			fprintf(stderr, "Bug in Paul's heapsort!\n");	// used to blame intro_sort
+			exit(1);
+		}
+	}
+
+	srand48(11);
+	for (i = 0; i < N; ++i) array[i] = (int)lrand48();
+	t1 = clock();
+	quickSort(array, N);
+	t2 = clock();
+	fprintf(stderr, "Paul's quicksort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
+	for (i = 0; i < N-1; ++i) {
+		if (array[i] > array[i+1]) {
+			fprintf(stderr, "Bug in Paul's quicksort!\n");	// used to blame intro_sort
+			exit(1);
+		}
+	}
+
+	srand48(11);
+	for (i = 0; i < N; ++i) array[i] = (int)lrand48();
+	t1 = clock();
+	mergeSort(array, N);
+	t2 = clock();
+	fprintf(stderr, "Paul's mergesort: %.3lf\n", (double)(t2-t1)/CLOCKS_PER_SEC);
+	for (i = 0; i < N-1; ++i) {
+		if (array[i] > array[i+1]) {
+			fprintf(stderr, "Bug in Paul's mergesort!\n");	// used to blame intro_sort
+			exit(1);
+		}
+	}
+
+	free(array);
+	return 0;
+}
diff --git a/web/server/h2o/libh2o/deps/klib/test/kstring_bench.c b/web/server/h2o/libh2o/deps/klib/test/kstring_bench.c
new file mode 100644
index 000000000..82598e88c
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/test/kstring_bench.c
@@ -0,0 +1,51 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include "kstring.h"
+
+#define N 10000000
+
+int main()
+{ /* Compares the speed of kputw(), ksprintf() and kputs() when formatting N pseudo-random integers into a kstring. */
+	int i;
+	clock_t t;
+	kstring_t s, s2;
+	srand48(11);
+	s.l = s.m = 0; s.s = 0;
+	t = clock();
+	for (i = 0; i < N; ++i) {
+		int x = lrand48();
+		s.l = 0; /* rewind: the same buffer is reused on every iteration */
+		kputw(x, &s);
+	}
+	fprintf(stderr, "kputw: %lf\n", (double)(clock() - t) / CLOCKS_PER_SEC);
+	srand48(11); /* same seed: every variant formats the same numbers */
+	t = clock();
+	for (i = 0; i < N; ++i) {
+		int x = lrand48();
+		s.l = 0;
+		ksprintf(&s, "%d", x);
+	}
+	fprintf(stderr, "ksprintf: %lf\n", (double)(clock() - t) / CLOCKS_PER_SEC);
+	srand48(11);
+	s2.l = s2.m = 0; s2.s = 0;
+	t = clock();
+	for (i = 0; i < N; ++i) {
+		int x = lrand48();
+		s2.l = s.l = 0;
+		kputw(x, &s2);
+		kputs(s2.s, &s);
+	}
+	fprintf(stderr, "kputw+kputs: %lf\n", (double)(clock() - t) / CLOCKS_PER_SEC);
+	srand48(11);
+	t = clock();
+	for (i = 0; i < N; ++i) {
+		int x = lrand48();
+		s2.l = s.l = 0;
+		kputw(x, &s2);
+		ksprintf(&s, "%s", s2.s);
+	}
+	fprintf(stderr, "kputw+ksprintf: %lf\n", (double)(clock() - t) / CLOCKS_PER_SEC);
+	free(s.s); free(s2.s); /* release both string buffers, which used to be leaked */
+	return 0;
+}
diff --git a/web/server/h2o/libh2o/deps/klib/test/kstring_bench2.c b/web/server/h2o/libh2o/deps/klib/test/kstring_bench2.c
new file mode 100644
index 000000000..b7707a8ec
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/test/kstring_bench2.c
@@ -0,0 +1,131 @@
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include "kstring.h"
+
+#ifdef __APPLE__
+#define HAVE_STRNSTR
+#endif
+
+#ifdef __linux__
+#define HAVE_MEMMEM
+#endif
+
+static int str_len = 1024*1024*128;
+static int pat_len = 30;
+static int alphabet = 2;
+static int repeat = 50;
+
+/* Returns a malloc'd buffer of len bytes: pseudo-random symbols '!' .. '!'+a-1 (fixed seed 11), last byte NUL. Caller frees. */
+char *gen_data(int len, int a)
+{
+	char *data;
+	int i;
+	srand48(11);
+	data = malloc(len);
+	for (i = 0; i < len; ++i)
+		data[i] = (int)(a * drand48()) + '!';
+	if (len > 0) data[len - 1] = 0; // terminate at the requested length, not at the global str_len
+	return data;
+}
+// http://srcvault.scali.eu.org/cgi-bin/Syntax/c/BoyerMoore.c
+char *BoyerMoore( unsigned char *data, unsigned int dataLength, unsigned char *string, unsigned int strLength ) // looks for string[0..strLength) in data[0..dataLength); returns the start of a match or NULL
+{
+	unsigned int skipTable[256], i;
+	unsigned char *search;
+	register unsigned char lastChar;
+
+	if (strLength == 0)
+		return NULL;
+
+	for (i = 0; i < 256; i++)
+		skipTable[i] = strLength; // default shift for bytes that do not occur in the pattern: the full pattern length
+	search = string;
+	i = --strLength; // from here on strLength is the index of the pattern's last byte
+	do {
+		skipTable[*search++] = i; // distance from this pattern byte to the pattern's end; later occurrences overwrite earlier ones
+	} while (i--);
+	lastChar = *--search; // last byte of the pattern; its table entry is 0
+	search = data + strLength; // text position aligned with the pattern's last byte
+	dataLength -= strLength+(strLength-1);
+	while ((int)dataLength > 0 ) {
+		unsigned int skip;
+		skip = skipTable[*search]; // two unconditional shifts; each is 0 once *search equals lastChar
+		search += skip;
+		dataLength -= skip;
+		skip = skipTable[*search];
+		search += skip;
+		dataLength -= skip;
+		skip = skipTable[*search];
+		if (*search != lastChar) {
+			search += skip;
+			dataLength -= skip;
+			continue;
+		}
+		i = strLength;
+		do {
+			if (i-- == 0) return search; // every pattern byte matched; search now points at the start of the match
+		} while (*--search == string[i]); // compare backwards, starting with the byte before the last one
+		search += (strLength - i + 1); // mismatch: go back to where the comparison started and step one byte forward
+		dataLength--;
+	}
+	return NULL;
+}
+
+int main() // Benchmark: builds a str_len-byte text, then times `repeat` substring searches with each available routine.
+{
+	char *data;
+	int i;
+	clock_t t;
+	t = clock();
+	data = gen_data(str_len, alphabet);
+	fprintf(stderr, "Generate data in %.3f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+	{
+		t = clock(); srand48(1331); // the same seed precedes every section, so each routine gets the same offsets
+		for (i = 0; i < repeat; ++i) {
+			int y = lrand48() % (str_len - pat_len); // the pattern is the pat_len bytes of data starting at offset y
+			char *ret;
+			ret = kmemmem(data, str_len, data + y, pat_len, 0);
+//			printf("%d, %d\n", (int)(ret - data), y);
+		}
+		fprintf(stderr, "Search patterns in %.3f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+	}
+	if (1) {
+		t = clock(); srand48(1331);
+		for (i = 0; i < repeat; ++i) {
+			int y = lrand48() % (str_len - pat_len);
+			char *ret;
+			ret = BoyerMoore(data, str_len, data + y, pat_len);
+//			printf("%d, %d\n", (int)(ret - data), y);
+		}
+		fprintf(stderr, "Search patterns in %.3f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+	}
+#ifdef HAVE_STRNSTR
+	if (1) {
+		char *tmp;
+		t = clock(); srand48(1331);
+		tmp = calloc(pat_len+1, 1); // zero-filled and one byte longer than the pattern, so the copy below stays NUL-terminated
+		for (i = 0; i < repeat; ++i) {
+			int y = lrand48() % (str_len - pat_len);
+			char *ret;
+			memcpy(tmp, data + y, pat_len); // strnstr takes a C string, so the pattern is copied out of data first
+			ret = strnstr(data, tmp, str_len);
+		}
+		fprintf(stderr, "Search patterns in %.3f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);		
+	}
+#endif
+#ifdef HAVE_MEMMEM
+	if (1) {
+		t = clock(); srand48(1331);
+		for (i = 0; i < repeat; ++i) {
+			int y = lrand48() % (str_len - pat_len);
+			char *ret;
+			ret = memmem(data, str_len, data + y, pat_len);
+//			printf("%d, %d\n", (int)(ret - data), y);
+		}
+		fprintf(stderr, "Search patterns in %.3f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
+	}
+#endif
+	return 0;
+}
diff --git a/web/server/h2o/libh2o/deps/klib/test/kstring_test.c b/web/server/h2o/libh2o/deps/klib/test/kstring_test.c
new file mode 100644
index 000000000..76f9532e7
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/test/kstring_test.c
@@ -0,0 +1,76 @@
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "kstring.h"
+
+int nfail = 0;
+
+/* Reports a failure on stderr and bumps nfail unless ks equals `correct` in both length and content. */
+void check(const char *what, const kstring_t *ks, const char *correct)
+{
+	if (ks->l == strlen(correct) && strcmp(ks->s, correct) == 0) return;
+	fprintf(stderr, "%s produced \"%.*s\" (\"%s\" is correct)\tFAIL\n", what, (int)(ks->l), ks->s, correct);
+	nfail++;
+}
+
+/* Writes n into the emptied ks with kputw() and checks it against the "%d" rendering from sprintf(). */
+void test_kputw(kstring_t *ks, int n)
+{
+	char expected[16];
+
+	sprintf(expected, "%d", n);
+	ks->l = 0;
+	kputw(n, ks);
+	check("kputw()", ks, expected);
+}
+
+/* Writes n into the emptied ks with kputl() and checks it against the "%ld" rendering from sprintf(). */
+void test_kputl(kstring_t *ks, long n)
+{
+	char expected[24];
+
+	sprintf(expected, "%ld", n);
+	ks->l = 0;
+	kputl(n, ks);
+	check("kputl()", ks, expected);
+}
+
+int main() /* Runs kputw()/kputl() over boundary values; the exit status says whether any check failed. */
+{
+	kstring_t ks;
+
+	ks.l = ks.m = 0; /* empty string, no buffer yet */
+	ks.s = NULL;
+
+	test_kputw(&ks, 0);
+	test_kputw(&ks, 1);
+	test_kputw(&ks, 37);
+	test_kputw(&ks, 12345);
+	test_kputw(&ks, -12345);
+	test_kputw(&ks, INT_MAX); /* extremes of int, including the asymmetric INT_MIN */
+	test_kputw(&ks, -INT_MAX);
+	test_kputw(&ks, INT_MIN);
+
+	test_kputl(&ks, 0);
+	test_kputl(&ks, 1);
+	test_kputl(&ks, 37);
+	test_kputl(&ks, 12345);
+	test_kputl(&ks, -12345);
+	test_kputl(&ks, INT_MAX); /* int extremes passed as long ... */
+	test_kputl(&ks, -INT_MAX);
+	test_kputl(&ks, INT_MIN);
+	test_kputl(&ks, LONG_MAX); /* ... followed by the extremes of long itself */
+	test_kputl(&ks, -LONG_MAX);
+	test_kputl(&ks, LONG_MIN);
+
+	free(ks.s); /* the one buffer shared by all the calls above */
+
+	if (nfail > 0) { /* nfail is incremented by check() on every mismatch */
+		fprintf(stderr, "Total failures: %d\n", nfail);
+		return EXIT_FAILURE;
+	}
+
+	return EXIT_SUCCESS;
+}
diff --git a/web/server/h2o/libh2o/deps/klib/test/kthread_test.c b/web/server/h2o/libh2o/deps/klib/test/kthread_test.c
new file mode 100644
index 000000000..1b67ed4ea
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/test/kthread_test.c
@@ -0,0 +1,69 @@
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <pthread.h>
+#if HAVE_CILK
+#include <cilk/cilk.h>
+#include <cilk/cilk_api.h>
+#endif
+
+typedef struct { // job description shared by every compute() call
+	int max_iter, w, h; // iteration cap; grid width and height in items
+	double xmin, xmax, ymin, ymax; // bounds of the region the grid indices are mapped onto
+	int *k; // one slot per item; compute() stores the item's iteration count here
+} global_t;
+
+// Worker for one item: iterates the point mapped from grid index i and stores in k[i] how many steps ran before the escape test hit (max_iter if it never did).
+static void compute(void *_g, int i, int tid)
+{
+	global_t *ctx = (global_t*)_g;
+	const int col = i % ctx->w, row = i / ctx->w;
+	const double cx = ctx->xmin + (ctx->xmax - ctx->xmin) * col / ctx->w;
+	const double cy = ctx->ymin + (ctx->ymax - ctx->ymin) * row / ctx->h;
+	double re = cx, im = cy;
+	int iter;
+	assert(ctx->k[i] < 0); // the slot must still be negative, i.e. not yet written
+	for (iter = 0; iter < ctx->max_iter; ++iter) {
+		const double cross = re * im, re2 = re * re, im2 = im * im;
+		if (re2 + im2 >= 4) break;
+		re = re2 - im2 + cx;
+		im = cross + cross + cy;
+	}
+	ctx->k[i] = iter;
+}
+
+void kt_for(int n_threads, int n_items, void (*func)(void*,int,int), void *data);
+
+int main(int argc, char *argv[]) // Runs compute() over every grid item with the selected backend, then asserts that each item was written.
+{
+	int i, tmp, tot, type = 0, n_threads = 2;
+	global_t global = { 10240*100, 800, 600, -2., -1.2, -1.2, 1.2, 0 }; // field order: max_iter, w, h, xmin, xmax, ymin, ymax, k
+//	global_t global = { 10240*1, 8, 6, -2., -1.2, -1.2, 1.2, 0 };
+
+	if (argc > 1) {
+		type = argv[1][0] == 'o'? 2 : argv[1][0] == 'c'? 3 : argv[1][0] == 'n'? 1 : 0; // first letter picks the backend: 'o' -> 2, 'c' -> 3, 'n' -> 1, anything else -> 0
+		if (argv[1][0] >= '0' && argv[1][0] <= '9')
+			n_threads = atoi(argv[1]); // a leading digit means the argument is a thread count for kt_for
+	} else {
+		fprintf(stderr, "Usage: ./a.out [openmp | cilk | #threads]\n");
+	}
+	tot = global.w * global.h;
+	global.k = calloc(tot, sizeof(int));
+	for (i = 0; i < tot; ++i) global.k[i] = -1; // negative marks "not computed yet"; compute() asserts on it
+	if (type == 0) {
+		kt_for(n_threads, tot, compute, &global);
+	} else if (type == 2) {
+		#pragma omp parallel for
+		for (i = 0; i < tot; ++i)
+			compute(&global, i, 0);
+	} else if (type == 3) {
+		#if HAVE_CILK
+		cilk_for (i = 0; i < tot; ++i)
+			compute(&global, i, 0);
+		#endif
+	} // NOTE(review): type 1 ('n'), and type 3 without HAVE_CILK, run nothing, so the assert below would fire - confirm intended
+	for (i = tmp = 0; i < tot; ++i) tmp += (global.k[i] < 0); // count the items that were never written
+	free(global.k);
+	assert(tmp == 0);
+	return 0;
+}
diff --git a/web/server/h2o/libh2o/deps/klib/test/kvec_test.cc b/web/server/h2o/libh2o/deps/klib/test/kvec_test.cc
new file mode 100644
index 000000000..1015574e4
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/klib/test/kvec_test.cc
@@ -0,0 +1,69 @@
+#include <vector>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "kvec.h"
+
+// Benchmark: fills N ints, M times over, using plain C arrays, kvec and std::vector; each timing is printed to stdout.
+int main()
+{
+	int M = 10, N = 20000000, i, j;
+	clock_t t = clock();
+	for (i = 0; i < M; ++i) {
+		int *array = (int*)malloc(N * sizeof(int));
+		for (j = 0; j < N; ++j) array[j] = j;
+		free(array);
+	}
+	printf("C array, preallocated: %.3f sec\n",
+		   (float)(clock() - t) / CLOCKS_PER_SEC);
+	t = clock();
+	for (i = 0; i < M; ++i) {
+		int *array = 0, max = 0;
+		for (j = 0; j < N; ++j) {
+			if (j == max) { // buffer full: double the capacity (1 on the first pass)
+				max = !max? 1 : max << 1;
+				array = (int*)realloc(array, sizeof(int)*max);
+			}
+			array[j] = j;
+		}
+		free(array);
+	}
+	printf("C array, dynamic: %.3f sec\n",
+		   (float)(clock() - t) / CLOCKS_PER_SEC);
+	t = clock();
+	for (i = 0; i < M; ++i) {
+		kvec_t(int) array;
+		kv_init(array);
+		kv_resize(int, array, N);
+		for (j = 0; j < N; ++j) kv_a(int, array, j) = j;
+		kv_destroy(array);
+	}
+	printf("C vector, dynamic(kv_a): %.3f sec\n",
+		   (float)(clock() - t) / CLOCKS_PER_SEC);
+	t = clock();
+	for (i = 0; i < M; ++i) {
+		kvec_t(int) array;
+		kv_init(array);
+		for (j = 0; j < N; ++j)
+			kv_push(int, array, j);
+		kv_destroy(array);
+	}
+	printf("C vector, dynamic(kv_push): %.3f sec\n",
+		   (float)(clock() - t) / CLOCKS_PER_SEC);
+	t = clock();
+	for (i = 0; i < M; ++i) {
+		std::vector<int> array;
+		array.resize(N); // was reserve(N): that only sets capacity, and operator[] past size() is undefined behaviour
+		for (j = 0; j < N; ++j) array[j] = j;
+	}
+	printf("C++ vector, preallocated: %.3f sec\n",
+		   (float)(clock() - t) / CLOCKS_PER_SEC);
+	t = clock();
+	for (i = 0; i < M; ++i) {
+		std::vector<int> array;
+		for (j = 0; j < N; ++j) array.push_back(j);
+	}
+	printf("C++ vector, dynamic: %.3f sec\n",
+		   (float)(clock() - t) / CLOCKS_PER_SEC);
+	return 0;
+}