summaryrefslogtreecommitdiffstats
path: root/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/.gitignore3
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/Makefile184
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/analyse.py207
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/boot.c144
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/curve25519-results.txt22
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/ext/cutest.h55
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.efm32.ld8
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.lm3s6965evb.ld7
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.qemucm3.ld8
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.std.ld172
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.stm32f0.ld8
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.stm32f1.ld8
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.stm32f3.ld8
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/main.c447
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/memcpy.s49
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/memset.s50
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/merge.py26
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.efm32.cfg3
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.stm32f0.cfg3
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.stm32f1.cfg3
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.stm32f3.cfg3
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/report.py276
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/semihost.c170
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/semihost.h40
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/semihost.s15
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/cortex_m0_mpy121666.s199
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/cortex_m0_reduce25519.s176
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/mul.s1109
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/scalarmult.c761
-rw-r--r--web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/sqr.s777
30 files changed, 4941 insertions, 0 deletions
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/.gitignore b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/.gitignore
new file mode 100644
index 00000000..5841cb8e
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/.gitignore
@@ -0,0 +1,3 @@
+*.log
+*.elf
+*.bin
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/Makefile b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/Makefile
new file mode 100644
index 00000000..7be9a53a
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/Makefile
@@ -0,0 +1,184 @@
+# Single-function measurement programs.  Each name is handed to main.c as
+# -DTEST=<name> by the %.<arch>.elf rules.
+FUNCS := do_nothing stack_8w stack_64w
+FUNCS += hashtest_sha256 hashtest_sha512
+FUNCS += hashtest_sha3_256 hashtest_sha3_512
+FUNCS += aes128block_test aes128sched_test
+FUNCS += aes256block_test aes256sched_test
+FUNCS += aes128gcm_test aes128eax_test
+FUNCS += aes128ccm_test
+FUNCS += salsa20_test chacha20_test
+FUNCS += poly1305_test hmacsha256_test
+FUNCS += curve25519_test
+FUNCS += norx_test
+
+# AEAD programs built with the bracket-mode options (see CFLAGS_aeadperf_*).
+AEADS := aeadperf_aes128gcm
+AEADS += aeadperf_aes128ccm
+AEADS += aeadperf_aes128eax
+AEADS += aeadperf_aes256gcm
+AEADS += aeadperf_aes256ccm
+AEADS += aeadperf_aes256eax
+AEADS += aeadperf_norx
+AEADS += aeadperf_chacha20poly1305
+
+# Unit-test programs, each built from ../<name>.c.
+TESTS := testcurve25519 testaes testmodes testsalsa20 testsha1 testsha2
+TESTS += testsha3 testpoly1305 testnorx testchacha20poly1305 testdrbg
+
+# Supported targets; each has a linkscript.<arch>.ld and a %.<arch>.elf rule.
+ARCHS := stm32f0 stm32f1 stm32f3 efm32 qemucm3
+
+
+# Default goal: a raw .bin image of every program for every architecture.
+# The list is generated from $(ARCHS) so it cannot drift from the set of
+# architectures used by the rest of this file.
+all: $(foreach arch,$(ARCHS),$(patsubst %,%.$(arch).bin,$(FUNCS) $(AEADS) $(TESTS)))
+
+# `all` names a goal, not a file: a stray file called `all` must never make
+# the build look up to date.
+.PHONY: all
+
+# Compile-and-link command shared by the %.<arch>.elf rules below.
+#   $(1)  architecture name, selects linkscript.$(1).ld
+#   $(2)  value passed to -mcpu=
+#   $(3)  core macro defined with -D
+# $* is the program name: it selects CFLAGS_<name> and becomes -DTEST=<name>.
+# The sources ($^) come from the prerequisite-only rules further down.
+link_elf = arm-none-eabi-gcc $(CFLAGS) $(CFLAGS_$*) $(LDFLAGS) -T linkscript.$(1).ld -mcpu=$(2) -D$(3) -o $@ $^ -DTEST=$* -lgcc
+
+%.stm32f0.elf:
+	$(call link_elf,stm32f0,cortex-m0,CORTEX_M0)
+
+%.stm32f1.elf:
+	$(call link_elf,stm32f1,cortex-m3,CORTEX_M3)
+
+%.stm32f3.elf:
+	$(call link_elf,stm32f3,cortex-m4,CORTEX_M4)
+
+%.efm32.elf:
+	$(call link_elf,efm32,cortex-m0,CORTEX_M0)
+
+%.qemucm3.elf:
+	$(call link_elf,qemucm3,cortex-m3,CORTEX_M3)
+
+# Flatten a linked ELF into a raw binary image.
+%.bin: %.elf
+	arm-none-eabi-objcopy -O binary $< $@
+
+# Do not let make delete .bin files it built only as an intermediate step.
+.PRECIOUS: %.bin
+
+# Building blocks for the per-program flags.
+AES_OPTIONS = -DCF_AES_ENCRYPT_ONLY=1 -DCF_SIDE_CHANNEL_PROTECTION=0
+AES128_OPTIONS = -DCF_AES_MAXROUNDS=AES128_ROUNDS
+AES256_OPTIONS = -DCF_AES_MAXROUNDS=AES256_ROUNDS
+
+AEADPERF_BRACKET = -DBRACKET_MODE=1 -DBRACKET_START=0 -DBRACKET_END=256 -DBRACKET_STEP=4
+
+# Common combinations of the above.
+aes128_flags = $(AES_OPTIONS) $(AES128_OPTIONS)
+aes256_flags = $(AES_OPTIONS) $(AES256_OPTIONS)
+
+# CFLAGS_<program> is picked up by the %.<arch>.elf rules through $(CFLAGS_$*).
+# Programs without an entry here get no extra flags.
+CFLAGS_aes128block_test = $(aes128_flags)
+CFLAGS_aes128sched_test = $(aes128_flags)
+CFLAGS_aes128gcm_test = $(aes128_flags)
+CFLAGS_aes128eax_test = $(aes128_flags)
+CFLAGS_aes128ccm_test = $(aes128_flags)
+CFLAGS_poly1305_test = $(aes128_flags)
+
+CFLAGS_aes256block_test = $(aes256_flags)
+CFLAGS_aes256sched_test = $(aes256_flags)
+
+CFLAGS_aeadperf_aes128gcm = $(aes128_flags) $(AEADPERF_BRACKET)
+CFLAGS_aeadperf_aes128eax = $(aes128_flags) $(AEADPERF_BRACKET)
+CFLAGS_aeadperf_aes128ccm = $(aes128_flags) $(AEADPERF_BRACKET)
+CFLAGS_aeadperf_aes256gcm = $(aes256_flags) $(AEADPERF_BRACKET)
+CFLAGS_aeadperf_aes256eax = $(aes256_flags) $(AEADPERF_BRACKET)
+CFLAGS_aeadperf_aes256ccm = $(aes256_flags) $(AEADPERF_BRACKET)
+CFLAGS_aeadperf_norx = $(AEADPERF_BRACKET)
+CFLAGS_aeadperf_chacha20poly1305 = $(AEADPERF_BRACKET)
+
+CFLAGS_testaes = -DCF_SIDE_CHANNEL_PROTECTION=0
+
+# Flags common to every program: include paths, size optimisation, one
+# section per function (for -gc-sections), warnings as errors, Thumb code.
+CFLAGS = -I./ext -I../ext -I..
+CFLAGS += -Os -ffunction-sections -g
+CFLAGS += -Wall -Werror -std=gnu99 -mthumb
+
+# Freestanding link: boot.c provides the startup code and C library stubs.
+LDFLAGS = -nostartfiles -nostdlib -Wl,-gc-sections
+
+# Assembly routines from unacl/.
+CURVESRCS := unacl/cortex_m0_mpy121666.s unacl/cortex_m0_reduce25519.s unacl/mul.s unacl/sqr.s
+
+# Startup/runtime support followed by the library sources from the parent directory.
+SRCS := boot.c memcpy.s memset.s semihost.c semihost.s
+SRCS += ../sha1.c ../sha256.c ../sha512.c ../sha3.c ../blockwise.c ../chash.c
+SRCS += ../curve25519.c ../poly1305.c
+SRCS += ../aes.c ../eax.c ../gcm.c ../cbcmac.c ../ccm.c
+SRCS += ../modes.c ../cmac.c ../gf128.c
+SRCS += ../hmac.c ../pbkdf2.c ../salsa20.c ../chacha20.c
+SRCS += ../norx.c ../chacha20poly1305.c ../drbg.c
+
+# Prerequisite-only rules: they supply $^ for the %.<arch>.elf pattern rules.
+
+# Measurement programs are main.c linked with the whole library.
+bench_srcs := $(SRCS) main.c $(CURVESRCS)
+$(addsuffix .stm32f0.elf,$(FUNCS) $(AEADS)): $(bench_srcs)
+$(addsuffix .stm32f1.elf,$(FUNCS) $(AEADS)): $(bench_srcs)
+$(addsuffix .stm32f3.elf,$(FUNCS) $(AEADS)): $(bench_srcs)
+$(addsuffix .efm32.elf,$(FUNCS) $(AEADS)): $(bench_srcs)
+$(addsuffix .qemucm3.elf,$(FUNCS) $(AEADS)): $(bench_srcs)
+
+# Unit tests are the library plus one test driver; only the curve25519 test
+# also links the unacl assembly.
+$(ARCHS:%=testcurve25519.%.elf): $(SRCS) $(CURVESRCS) ../testcurve25519.c
+$(ARCHS:%=testaes.%.elf): $(SRCS) ../testaes.c
+$(ARCHS:%=testmodes.%.elf): $(SRCS) ../testmodes.c
+$(ARCHS:%=testsalsa20.%.elf): $(SRCS) ../testsalsa20.c
+$(ARCHS:%=testsha1.%.elf): $(SRCS) ../testsha1.c
+$(ARCHS:%=testsha2.%.elf): $(SRCS) ../testsha2.c
+$(ARCHS:%=testsha3.%.elf): $(SRCS) ../testsha3.c
+$(ARCHS:%=testpoly1305.%.elf): $(SRCS) ../testpoly1305.c
+$(ARCHS:%=testnorx.%.elf): $(SRCS) ../testnorx.c
+$(ARCHS:%=testchacha20poly1305.%.elf): $(SRCS) ../testchacha20poly1305.c
+$(ARCHS:%=testdrbg.%.elf): $(SRCS) ../testdrbg.c
+
+# Run one program under QEMU.  The log starts with the ELF program headers,
+# then collects QEMU's stderr, and is printed once the run ends.
+run.%.qemucm3: %.qemucm3.bin
+	arm-none-eabi-readelf -l $(basename $^).elf > $@.log
+	qemu-system-gnuarmeclipse -verbose -verbose -M STM32-P103 -kernel $^ -semihosting -nographic -monitor null -serial null 2>> $@.log
+	cat $@.log
+
+# Recipe shared by the hardware run.%.<arch> rules; $(1) is the architecture
+# name and selects openocd.$(1).cfg.  openocd is started in the background
+# and appends to the log, then gdb loads the ELF through it, enables
+# semihosting, runs the program and shuts openocd down once it halts.
+define run_on_board
+arm-none-eabi-readelf -l $^ > $@.log
+echo '-----' >> $@.log
+openocd -f openocd.$(1).cfg >> $@.log &
+arm-none-eabi-gdb --quiet --batch-silent \
+  $^ \
+  -ex 'target remote :3333' \
+  -ex 'monitor reset halt' \
+  -ex 'load' \
+  -ex 'monitor arm semihosting enable' \
+  -ex 'monitor reset run' \
+  -ex 'monitor wait_halt 720000' \
+  -ex 'monitor shutdown'
+endef
+
+run.%.efm32: %.efm32.elf
+	$(call run_on_board,efm32)
+
+run.%.stm32f0: %.stm32f0.elf
+	$(call run_on_board,stm32f0)
+
+run.%.stm32f1: %.stm32f1.elf
+	$(call run_on_board,stm32f1)
+
+run.%.stm32f3: %.stm32f3.elf
+	$(call run_on_board,stm32f3)
+
+# Aggregate goals: each one runs a list of programs through the matching
+# run.<program>.<arch> rule.
+
+# Everything except the AEAD programs, under QEMU.
+test: $(addprefix run.,$(addsuffix .qemucm3,$(FUNCS) $(TESTS)))
+
+# Measurement programs only, on the STM32F0 board.
+perf.stm32f0: $(addprefix run.,$(addsuffix .stm32f0,$(FUNCS)))
+
+# Measurement programs and unit tests on real hardware.
+test.stm32f0: $(addprefix run.,$(addsuffix .stm32f0,$(FUNCS) $(TESTS)))
+test.stm32f1: $(addprefix run.,$(addsuffix .stm32f1,$(FUNCS) $(TESTS)))
+test.stm32f3: $(addprefix run.,$(addsuffix .stm32f3,$(FUNCS) $(TESTS)))
+
+.PHONY: test perf.stm32f0 test.stm32f0 test.stm32f1 test.stm32f3
+
+# Remove the logs, ELF files and binary images produced in this directory.
+# Declared phony so that a file named `clean` cannot disable the target.
+.PHONY: clean
+clean:
+	rm -rf *.log *.elf *.bin
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/analyse.py b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/analyse.py
new file mode 100644
index 00000000..d2c456fe
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/analyse.py
@@ -0,0 +1,207 @@
+import subprocess
+import sys
+import re
+
+function_intro_re = re.compile(r'^(?P<addr>[0-9a-fA-F]{8}) <(?P<name>[a-zA-Z0-9\._]+)>:$')  # function header line: 8 hex digits, then "<name>:"
+insn_re = re.compile(r'^\s+(?P<addr>[0-9a-fA-F]+):\s+(?P<insn>[0-9a-fA-F ]+)\s+\t(?P<op>.*)$')  # instruction line: hex address, hex encoding, then a tab and the mnemonic/operand text
+
+class Instruction:
+ def __init__(self, addr, insn, op):
+ self.addr = long(addr, 16)
+ self.insn = insn
+
+ args = op.split('\t', 1)
+
+ self.op = args[0].strip()
+ if len(args) == 2:
+ comment = args[1].strip().split(';', 1)
+ else:
+ comment = args
+
+ self.args = comment[0].strip()
+
+ if len(comment) == 2:
+ self.comment = comment[1].strip()
+ else:
+ self.comment = ''
+
+ def __repr__(self):
+ return '<insn %r>' % (self.__dict__)
+
+
+def literal_branch_target(t):
+ return ' <' in t
+
+class Function:
+ def __init__(self, addr, name):
+ self.name = name
+ self.addr = long(addr, 16)
+ self.insns = []
+ self.calls = []
+
+ def __repr__(self):
+ return '<%s %d instructions>' % (self.name, len(self.insns))
+
+ def add_insn(self, insn):
+ self.insns.append(Instruction(**insn))
+
+ def contains_addr(self, addr):
+ if self.insns:
+ return addr >= self.addr and addr <= self.insns[-1].addr
+ else:
+ return addr == self.addr
+
+ def dump(self):
+ print self.name + ':'
+ for insn in self.insns:
+ print ' ', '%04x' % insn.addr + ':', insn.op, insn.args, '\t;', insn.comment
+
+ def get_literal_word(self, addr):
+ for insn in self.insns:
+ if insn.addr == addr and insn.op == '.word':
+ w = int(insn.args, 16)
+ if w & 0x80000000:
+ w = -(w ^ 0xffffffff) + 1
+ return w
+ return None
+
+ def analyse(self, prog):
+ self.stack_guess = None
+ regs = {}
+
+ for insn in self.insns:
+ # stack adjustment with literal
+ if insn.op == 'sub' and insn.args.startswith('sp, ') and self.stack_guess is None:
+ sz = int(insn.args.split('#', 1)[1])
+ self.stack_guess = sz
+
+ # literal pool loads
+ if insn.op == 'ldr' and ', [pc, #' in insn.args:
+ reg, offset = insn.args.split(', [pc, #')
+ offset = int(offset.replace(']', ''))
+ word = self.get_literal_word(insn.addr + offset + 2)
+ if word is not None:
+ regs[reg] = word
+
+ if insn.op == 'add' and insn.args.startswith('sp, r') and self.stack_guess is None:
+ reg = insn.args.split(', ')[1]
+ if reg in regs:
+ self.stack_guess = regs[reg]
+
+ # static branches
+ if insn.op[0] == 'b' and literal_branch_target(insn.args):
+ target = long(insn.args.split(' <', 1)[0], 16)
+
+ targetf = prog.function_at_addr(target)
+
+ if targetf and targetf != self:
+ self.calls.append(targetf)
+
+ if self.stack_guess is None:
+ self.stack_guess = 0
+
+ def stack_usage(self, hints, warns, prog, depth = 0):
+ hinted_calls = []
+ if self.stack_guess:
+ print ' ' * depth, 'stack:', self.name, self.stack_guess, 'bytes'
+
+ our_hints = [h for h in hints if h and h[0] == self.name]
+ if our_hints:
+ hints = [h[1:] for h in our_hints]
+ hinted_calls = [prog.function_by_name(h[0]) for h in hints if h]
+ else:
+ if self.name in warns:
+ print ' WARN: no calls hints for fn-ptr caller', self.name
+
+ if self.calls + hinted_calls:
+ call_usage = max([f.stack_usage(hints, warns, prog, depth + 1) for f in self.calls + hinted_calls])
+ else:
+ call_usage = 0
+ return self.stack_guess + call_usage
+
+class Program:
+ def __init__(self):
+ self.functions = []
+
+ # sequence of tuples naming a call sequence known to occur
+ # this allows working out calls through pointers
+ self.call_hints = []
+
+ # function names to warn on if we don't have callees
+ self.call_warns = set()
+
+ def read_elf(self, elf):
+ current_fn = None
+
+ for x in subprocess.Popen(['arm-none-eabi-objdump', '-d', elf],
+ stdout = subprocess.PIPE).stdout:
+ x = x.rstrip('\n')
+ m = function_intro_re.match(x)
+ if m:
+ fn = Function(**m.groupdict())
+ current_fn = fn
+ self.functions.append(fn)
+
+ m = insn_re.match(x)
+ if m:
+ assert current_fn
+ current_fn.add_insn(m.groupdict())
+
+ def analyse(self):
+ for f in self.functions:
+ f.analyse(self)
+
+ def function_by_name(self, name):
+ fns = [fn for fn in self.functions if fn.name == name]
+ if len(fns) == 0:
+ return None
+ elif len(fns) == 1:
+ return fns[0]
+ else:
+ print 'warn: more than one function named', name
+ return None
+
+ def function_at_addr(self, addr):
+ for f in self.functions:
+ if f.addr == addr:
+ return f
+ return None
+
+ def add_call_hint(self, *seq):
+ self.call_hints.append(seq)
+
+ def add_call_warn(self, fn):
+ self.call_warns.add(fn)
+
+ def measure_stack(self, name):
+ fn = self.function_by_name(name)
+ if fn is None:
+ return 0
+
+ return fn.stack_usage(self.call_hints, self.call_warns, self)
+
+_, exe, fn = sys.argv
+
+p = Program()
+p.read_elf(exe)
+
+p.analyse()
+
+# calls which indirect through fn ptrs
+p.add_call_warn('cf_blockwise_accumulate')
+p.add_call_warn('cf_blockwise_accumulate_final')
+
+# hints to resolve those
+p.add_call_hint('cf_sha224_update', 'cf_blockwise_accumulate', 'cf_blockwise_accumulate_final', 'sha256_update_block')
+p.add_call_hint('cf_sha256_update', 'cf_blockwise_accumulate', 'cf_blockwise_accumulate_final', 'sha256_update_block')
+p.add_call_hint('cf_sha384_update', 'cf_blockwise_accumulate', 'cf_blockwise_accumulate_final', 'sha512_update_block')
+p.add_call_hint('cf_sha512_update', 'cf_blockwise_accumulate', 'cf_blockwise_accumulate_final', 'sha512_update_block')
+p.add_call_hint('cf_norx32_encrypt', 'input', 'cf_blockwise_accumulate', 'cf_blockwise_accumulate_final', 'input_block')
+p.add_call_hint('cf_norx32_decrypt', 'input', 'cf_blockwise_accumulate', 'cf_blockwise_accumulate_final', 'input_block')
+p.add_call_hint('cf_cbcmac_stream_update', 'cf_blockwise_accumulate', 'cf_blockwise_accumulate_final', 'cbcmac_process')
+p.add_call_hint('cf_cmac_stream_update', 'cf_blockwise_accumulate', 'cf_blockwise_accumulate_final', 'cmac_process_final_pad')
+p.add_call_hint('cf_cmac_stream_update', 'cf_blockwise_accumulate_final', 'cmac_process')
+p.add_call_hint('cf_cmac_stream_update', 'cf_blockwise_accumulate_final', 'cmac_process_final_nopad')
+
+
+print 'stack', fn, '=', p.measure_stack(fn)
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/boot.c b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/boot.c
new file mode 100644
index 00000000..d2a8e407
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/boot.c
@@ -0,0 +1,144 @@
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+extern int main(void);
+
+/* --- Defined by link script --- */
+extern uint32_t __etext; /* End of text/start of data. */
+extern uint32_t __data_start__, __data_end__; /* Data addresses in RAM */
+extern uint32_t __bss_start__, __bss_end__; /* BSS addresses in RAM */
+extern uint32_t __StackTop; /* End of stack in RAM */
+
+#define ATTR_SECTION(sec) __attribute__ ((section (sec)))
+
+/* --- Interrupt vector table. --- */
+void Reset_Handler(void);
+void SysTick_Handler(void);
+void infinite_loop(void);
+void do_nothing(void);
+
+/* Signature shared by every exception and interrupt handler. */
+typedef void (*vector_fn)(void);
+
+/* Layout of the vector table placed in the .isr_vector section: initial
+ * stack pointer, the system exception handlers in table order (with their
+ * reserved gaps), then 128 external interrupt entries. */
+typedef struct {
+  uint32_t *stack_top;
+  vector_fn reset;
+  vector_fn nmi;
+  vector_fn hard_fault;
+  vector_fn mmu_fault;
+  vector_fn bus_fault;
+  vector_fn usage_fault;
+  vector_fn reserved0[4];
+  vector_fn svc;
+  vector_fn debug_monitor;
+  vector_fn reserved1;
+  vector_fn pendsv;
+  vector_fn systick;
+  vector_fn irq[128];
+} vectors_t;
+
+/* Faults park the core in infinite_loop(); every other exception and all
+ * external interrupts return immediately, except SysTick which counts
+ * ticks.  The reserved entries are left zero. */
+vectors_t vectors ATTR_SECTION(".isr_vector") = {
+  .stack_top = &__StackTop,
+  .reset = Reset_Handler,
+  .nmi = do_nothing,
+  .hard_fault = infinite_loop,
+  .mmu_fault = infinite_loop,
+  .bus_fault = infinite_loop,
+  .usage_fault = infinite_loop,
+  .svc = do_nothing,
+  .debug_monitor = do_nothing,
+  .pendsv = do_nothing,
+  .systick = SysTick_Handler,
+  /* GNU range designator (the file is built with -std=gnu99). */
+  .irq = { [0 ... 127] = do_nothing }
+};
+
+/* --- ISRs --- */
+/* Reset entry point (the `reset` vector and the link script's ENTRY symbol).
+ * Initialises .data and .bss in RAM, then calls main(). */
+void Reset_Handler(void)
+{
+  /* Copy data segment contents from flash to RAM.  The link script loads
+   * .data at __etext (in FLASH) but runs it at __data_start__ (in RAM), so
+   * RAM is the destination and flash the source. */
+  uint32_t data_bytes = (&__data_end__ - &__data_start__) * sizeof(uint32_t);
+  memcpy(&__data_start__, &__etext, data_bytes);
+
+  /* Zero BSS. */
+  uint32_t bss_bytes = (&__bss_end__ - &__bss_start__) * sizeof(uint32_t);
+  memset(&__bss_start__, 0, bss_bytes);
+
+  main();
+
+  /* There is nothing to return to if main() comes back: spin forever. */
+  while (1)
+    ;
+}
+
+/* Assertion-failure hook: nothing is reported, execution just stops here. */
+void __assert_func(const char *file, int line, const char *func, const char *expr)
+{
+  for (;;)
+  {
+  }
+}
+
+/* Handler installed for the fault vectors: never returns. */
+void infinite_loop(void)
+{
+  for (;;)
+  {
+  }
+}
+
+/* Handler installed for exceptions and interrupts that are ignored. */
+void do_nothing(void)
+{
+  return;
+}
+
+/* Number of SysTick interrupts taken since start-up or the last reset_ticks(). */
+uint32_t ticks = 0;
+
+/* SysTick exception handler: counts one tick. */
+void SysTick_Handler(void)
+{
+  ticks += 1;
+}
+
+/* Returns the current tick count. */
+uint32_t get_ticks(void)
+{
+  uint32_t now = ticks;
+  return now;
+}
+
+/* Restarts the tick count from zero. */
+void reset_ticks(void)
+{
+  ticks = 0u;
+}
+
+/* Copies len bytes from vsrc to vtarg; the regions may overlap.
+ * Returns vtarg. */
+void *memmove(void *vtarg, const void *vsrc, size_t len)
+{
+  /* Source above target: a front-to-back copy never overwrites bytes that
+   * are still to be read.  NOTE(review): this relies on memcpy (memcpy.s)
+   * copying in ascending address order -- confirm. */
+  if (vsrc > vtarg)
+    return memcpy(vtarg, vsrc, len);
+  else if (vsrc == vtarg)
+    return vtarg;
+
+  uint8_t *targ = vtarg;
+  const uint8_t *src = vsrc;
+
+  /* Source below target: copy back-to-front, counting i down from len to 1
+   * so that exactly bytes len-1 .. 0 are moved. */
+  for (size_t i = len; i != 0; i--)
+    targ[i - 1] = src[i - 1];
+  return vtarg;
+}
+
+/* Compares len bytes as unsigned values.  Returns 0 if all are equal,
+ * otherwise -1 or 1 according to the first differing byte. */
+int memcmp(const void *va, const void *vb, size_t len)
+{
+  const uint8_t *left = va;
+  const uint8_t *right = vb;
+
+  while (len--)
+  {
+    uint8_t l = *left++;
+    uint8_t r = *right++;
+
+    if (l == r)
+      continue;
+    return (l > r) ? 1 : -1;
+  }
+
+  return 0;
+}
+
+/* Returns the number of characters before the terminating NUL. */
+size_t strlen(const char *c)
+{
+  const char *end = c;
+
+  while (*end != '\0')
+    end++;
+  return (size_t) (end - c);
+}
+
+/* Stops the program by spinning forever. */
+void abort(void)
+{
+  for (;;)
+  {
+  }
+}
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/curve25519-results.txt b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/curve25519-results.txt
new file mode 100644
index 00000000..968e40e5
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/curve25519-results.txt
@@ -0,0 +1,22 @@
+STM32F0
+donna-before-opt: 12907000c
+donna-after-opt: 17294000c
+donna-reset-opt: 12947000c
+~20k
+
+donna -O2 -Os: 15268000c
+donna -O2 -Os noasm: 20453000c
+donna -Os: 15748000c
+7.4k
+
+donna -O3: 12907000c 16KB 3380b
+donna -Os: 15748000c 7.4KB 3148b
+donna -O2: 15218000c 7.9KB 3148b
+
+tweetnacl -O2: 68876000c 3.0KB 2268b
+tweetnacl -Os: 75979000c 2.8KB 2244b
+tweetnacl -O3: 69622000c 8.9KB 2900b
+
+naclref -Os: 47813000c 3.2KB 4012b
+naclref -O3: 35059000c 4.1KB 4044b
+naclref -O2: 34309000c 3.5KB 4036b
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/ext/cutest.h b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/ext/cutest.h
new file mode 100644
index 00000000..fa3c5d84
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/ext/cutest.h
@@ -0,0 +1,55 @@
+/* cutest, for embedded targets. */
+
+#ifndef CUTEST_H
+#define CUTEST_H
+
+/* Main interface. */
+#define TEST_LIST const struct test__ test_list__[]
+#define TEST_CHECK(cond) test_check__((cond), __FILE__, __LINE__, #cond)
+/* no TEST_CHECK_ -- we don't have a good enough printf */
+
+/* Implementation */
+#include "../semihost.h"
+
+struct test__
+{
+ const char *name;
+ void (*func)(void);
+};
+
+extern const struct test__ test_list__[];
+
+/* Backs TEST_CHECK: when cond is false, reports the file, line and failed
+ * expression over semihosting and calls quit_failure(). */
+static void test_check__(int cond, const char *file, int line, const char *expr)
+{
+  if (!cond)
+  {
+    emit("Failed!\n");
+
+    emit("File: ");
+    emit(file);
+    emit("\n");
+
+    emit("Line: ");
+    emit_uint32(line);
+    emit("\n");
+
+    emit("Expr: ");
+    emit(expr);
+    emit("\n");
+
+    quit_failure();
+  }
+  /* otherwise: pass */
+}
+
+/* Prints the test name, runs the test, then prints OK once it returns. */
+static void run_test__(const struct test__ *t)
+{
+  emit(" ");
+  emit(t->name);
+  emit(": ");
+
+  t->func();
+
+  emit("OK\n");
+}
+
+/* Runs every entry of test_list__ in order, stopping at the entry whose
+ * name is NULL, then reports success. */
+int main(void)
+{
+  const struct test__ *t = test_list__;
+
+  emit("Running tests:\n");
+
+  while (t->name)
+  {
+    run_test__(t);
+    t++;
+  }
+
+  emit("Success\n");
+  quit_success();
+}
+
+#endif
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.efm32.ld b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.efm32.ld
new file mode 100644
index 00000000..8b9a6bfd
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.efm32.ld
@@ -0,0 +1,8 @@
+MEMORY
+{
+ FLASH (rx) : ORIGIN = 0x00000000, LENGTH = 64K
+ RAM (rwx) : ORIGIN = 0x20000000, LENGTH = 8K
+}
+
+INCLUDE linkscript.std.ld
+
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.lm3s6965evb.ld b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.lm3s6965evb.ld
new file mode 100644
index 00000000..14fdac4e
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.lm3s6965evb.ld
@@ -0,0 +1,7 @@
+MEMORY
+{
+ FLASH (rx) : ORIGIN = 0x00000000, LENGTH = 256K
+ RAM (rwx) : ORIGIN = 0x20000000, LENGTH = 64K
+}
+
+INCLUDE linkscript.std.ld
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.qemucm3.ld b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.qemucm3.ld
new file mode 100644
index 00000000..28264674
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.qemucm3.ld
@@ -0,0 +1,8 @@
+MEMORY
+{
+ FLASH (rx) : ORIGIN = 0x00000000, LENGTH = 128K
+ RAM (rwx) : ORIGIN = 0x20000000, LENGTH = 20K
+}
+
+INCLUDE linkscript.std.ld
+
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.std.ld b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.std.ld
new file mode 100644
index 00000000..c08d7bea
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.std.ld
@@ -0,0 +1,172 @@
+
+/* Linker script to place sections and symbol values. It should be used together
+ * with another linker script that defines the memory regions FLASH and RAM.
+ * It references the following symbols, which must be defined in code:
+ *   Reset_Handler : Entry of reset handler
+ *
+ * It defines the following symbols, which code can use without definition:
+ * __exidx_start
+ * __exidx_end
+ * __copy_table_start__
+ * __copy_table_end__
+ * __zero_table_start__
+ * __zero_table_end__
+ * __etext
+ * __data_start__
+ * __preinit_array_start
+ * __preinit_array_end
+ * __init_array_start
+ * __init_array_end
+ * __fini_array_start
+ * __fini_array_end
+ * __data_end__
+ * __bss_start__
+ * __bss_end__
+ * __end__
+ * end
+ * __HeapLimit
+ * __StackLimit
+ * __StackTop
+ * __stack
+ */
+ENTRY(Reset_Handler)
+
+SECTIONS
+{
+ .text :
+ {
+ KEEP(*(.isr_vector))
+ *(.text*)
+
+ KEEP(*(.init))
+ KEEP(*(.fini))
+
+ /* .ctors */
+ *crtbegin.o(.ctors)
+ *crtbegin?.o(.ctors)
+ *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors)
+ *(SORT(.ctors.*))
+ *(.ctors)
+
+ /* .dtors */
+ *crtbegin.o(.dtors)
+ *crtbegin?.o(.dtors)
+ *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors)
+ *(SORT(.dtors.*))
+ *(.dtors)
+
+ *(.rodata*)
+
+ KEEP(*(.eh_frame*))
+ } > FLASH
+
+ .ARM.extab :
+ {
+ *(.ARM.extab* .gnu.linkonce.armextab.*)
+ } > FLASH
+
+ __exidx_start = .;
+ .ARM.exidx :
+ {
+ *(.ARM.exidx* .gnu.linkonce.armexidx.*)
+ } > FLASH
+ __exidx_end = .;
+
+    /* To copy multiple ROM sections to RAM, keep the .copy.table section
+     * below (it is already enabled here) and define
+     * __STARTUP_COPY_MULTIPLE in startup_ARMCMx.S */
+
+ .copy.table :
+ {
+ . = ALIGN(4);
+ __copy_table_start__ = .;
+ LONG (__etext)
+ LONG (__data_start__)
+ LONG (__data_end__ - __data_start__)
+ __copy_table_end__ = .;
+ } > FLASH
+
+
+    /* To clear multiple BSS sections, keep the .zero.table section below
+     * (it is already enabled here) and define
+     * __STARTUP_CLEAR_BSS_MULTIPLE in startup_ARMCMx.S */
+ .zero.table :
+ {
+ . = ALIGN(4);
+ __zero_table_start__ = .;
+ LONG (__bss_start__)
+ LONG (__bss_end__ - __bss_start__)
+ __zero_table_end__ = .;
+ } > FLASH
+
+ __etext = .;
+
+ .data : AT (__etext)
+ {
+ __data_start__ = .;
+ *(vtable)
+ *(.data*)
+
+ . = ALIGN(4);
+ /* preinit data */
+ PROVIDE_HIDDEN (__preinit_array_start = .);
+ KEEP(*(.preinit_array))
+ PROVIDE_HIDDEN (__preinit_array_end = .);
+
+ . = ALIGN(4);
+ /* init data */
+ PROVIDE_HIDDEN (__init_array_start = .);
+ KEEP(*(SORT(.init_array.*)))
+ KEEP(*(.init_array))
+ PROVIDE_HIDDEN (__init_array_end = .);
+
+
+ . = ALIGN(4);
+ /* finit data */
+ PROVIDE_HIDDEN (__fini_array_start = .);
+ KEEP(*(SORT(.fini_array.*)))
+ KEEP(*(.fini_array))
+ PROVIDE_HIDDEN (__fini_array_end = .);
+
+ KEEP(*(.jcr*))
+ . = ALIGN(4);
+ /* All data end */
+ __data_end__ = .;
+
+ } > RAM
+
+ .bss :
+ {
+ . = ALIGN(4);
+ __bss_start__ = .;
+ *(.bss*)
+ *(COMMON)
+ . = ALIGN(4);
+ __bss_end__ = .;
+ } > RAM
+
+ .heap (COPY):
+ {
+ __end__ = .;
+ PROVIDE(end = .);
+ *(.heap*)
+ __HeapLimit = .;
+ } > RAM
+
+    /* The .stack_dummy section does not contain any symbols. It is used only
+     * so that the linker can calculate the size of the stack sections and
+     * assign values to the stack symbols later. */
+ .stack_dummy (COPY):
+ {
+ *(.stack*)
+ } > RAM
+
+    /* Set the stack top to the end of RAM, and move the stack limit down by
+     * the size of the .stack_dummy section. */
+ __StackTop = ORIGIN(RAM) + LENGTH(RAM);
+ __StackLimit = __StackTop - SIZEOF(.stack_dummy);
+ PROVIDE(__stack = __StackTop);
+
+ /* Check if data + heap + stack exceeds RAM limit */
+ ASSERT(__StackLimit >= __HeapLimit, "region RAM overflowed with stack")
+}
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.stm32f0.ld b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.stm32f0.ld
new file mode 100644
index 00000000..c7a3bd85
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.stm32f0.ld
@@ -0,0 +1,8 @@
+MEMORY
+{
+ FLASH (rx) : ORIGIN = 0x08000000, LENGTH = 128K
+ RAM (rwx) : ORIGIN = 0x20000000, LENGTH = 16K
+}
+
+INCLUDE linkscript.std.ld
+
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.stm32f1.ld b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.stm32f1.ld
new file mode 100644
index 00000000..d13f58de
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.stm32f1.ld
@@ -0,0 +1,8 @@
+MEMORY
+{
+ FLASH (rx) : ORIGIN = 0x08000000, LENGTH = 64K
+ RAM (rwx) : ORIGIN = 0x20000000, LENGTH = 8K
+}
+
+INCLUDE linkscript.std.ld
+
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.stm32f3.ld b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.stm32f3.ld
new file mode 100644
index 00000000..92eee46e
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.stm32f3.ld
@@ -0,0 +1,8 @@
+MEMORY
+{
+ FLASH (rx) : ORIGIN = 0x08000000, LENGTH = 64K
+ RAM (rwx) : ORIGIN = 0x20000000, LENGTH = 12K
+}
+
+INCLUDE linkscript.std.ld
+
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/main.c b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/main.c
new file mode 100644
index 00000000..5b7cbf22
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/main.c
@@ -0,0 +1,447 @@
+#ifndef TEST
+# error You must select a function to test.
+#endif
+
+#include "semihost.h"
+#include "aes.h"
+#include "hmac.h"
+#include "sha2.h"
+#include "sha3.h"
+#include "modes.h"
+#include "salsa20.h"
+#include "curve25519.h"
+#include "poly1305.h"
+#include "norx.h"
+#include "chacha20poly1305.h"
+
+#include <stdio.h>
+
+typedef void (*measure_fn)(void);
+static uint32_t bracket; /* bracket mode parameter */
+
+static void do_nothing(void)
+{
+  /* Empty on purpose: measuring this yields the harness baseline. */
+  return;
+}
+
+static void stack_64w(void)
+{
+  /* Touch both ends of a 64-word local array; volatile keeps the
+   * accesses (and therefore the array) from being optimised away. */
+  enum { NWORDS = 64 };
+  volatile uint32_t frame[NWORDS];
+
+  frame[0] = 0;
+  frame[NWORDS - 1] = 0;
+  (void) frame[NWORDS - 1];
+}
+
+static void stack_8w(void)
+{
+  /* Touch both ends of an 8-word local array; volatile keeps the
+   * accesses (and therefore the array) from being optimised away. */
+  enum { NWORDS = 8 };
+  volatile uint32_t frame[NWORDS];
+
+  frame[0] = 0;
+  frame[NWORDS - 1] = 0;
+  (void) frame[NWORDS - 1];
+}
+
+static void hashtest_sha256(void)
+{
+  /* SHA-256 over the empty message: init, zero-length update, finalise. */
+  cf_sha256_context state;
+  uint8_t digest[CF_SHA256_HASHSZ];
+
+  cf_sha256_init(&state);
+  cf_sha256_update(&state, "", 0);
+  cf_sha256_digest_final(&state, digest);
+}
+
+static void hashtest_sha512(void)
+{
+  /* SHA-512 over the empty message: init, zero-length update, finalise. */
+  cf_sha512_context state;
+  uint8_t digest[CF_SHA512_HASHSZ];
+
+  cf_sha512_init(&state);
+  cf_sha512_update(&state, "", 0);
+  cf_sha512_digest_final(&state, digest);
+}
+
+static void hashtest_sha3_256(void)
+{
+  /* SHA3-256 over the empty message: init, zero-length update, finalise. */
+  cf_sha3_context state;
+  uint8_t digest[CF_SHA3_256_HASHSZ];
+
+  cf_sha3_256_init(&state);
+  cf_sha3_256_update(&state, "", 0);
+  cf_sha3_256_digest_final(&state, digest);
+}
+
+static void hashtest_sha3_512(void)
+{
+  /* SHA3-512 over the empty message: init, zero-length update, finalise. */
+  cf_sha3_context state;
+  uint8_t digest[CF_SHA3_512_HASHSZ];
+
+  cf_sha3_512_init(&state);
+  cf_sha3_512_update(&state, "", 0);
+  cf_sha3_512_digest_final(&state, digest);
+}
+
+static void aes128block_test(void)
+{
+  /* Key schedule for a 16-byte all-zero key, then one in-place
+   * block encryption of an all-zero block. */
+  uint8_t key[16] = { 0 };
+  uint8_t block[16] = { 0 };
+  cf_aes_context aes;
+
+  cf_aes_init(&aes, key, sizeof key);
+  cf_aes_encrypt(&aes, block, block);
+}
+
+static void aes128sched_test(void)
+{
+  /* Key schedule only, for a 16-byte all-zero key. */
+  cf_aes_context aes;
+  uint8_t key[16] = { 0 };
+
+  cf_aes_init(&aes, key, sizeof key);
+}
+
+static void aes256block_test(void)
+{
+  /* Key schedule for a 32-byte all-zero key, then one in-place
+   * block encryption of an all-zero block. */
+  uint8_t key[32] = { 0 };
+  uint8_t block[16] = { 0 };
+  cf_aes_context aes;
+
+  cf_aes_init(&aes, key, sizeof key);
+  cf_aes_encrypt(&aes, block, block);
+}
+
+static void aes256sched_test(void)
+{
+  /* Key schedule only, for a 32-byte all-zero key. */
+  cf_aes_context aes;
+  uint8_t key[32] = { 0 };
+
+  cf_aes_init(&aes, key, sizeof key);
+}
+
+static void aes128gcm_test(void)
+{
+  /* AES-128-GCM over one 16-byte block of plaintext with 16 bytes of
+   * additional data and a 12-byte nonce; all inputs are zero. */
+  uint8_t key[16] = { 0 };
+  uint8_t plain[16] = { 0 };
+  uint8_t header[16] = { 0 };
+  uint8_t iv[12] = { 0 };
+  uint8_t out[16] = { 0 };
+  uint8_t mac[16] = { 0 };
+  cf_aes_context aes;
+
+  cf_aes_init(&aes, key, sizeof key);
+  cf_gcm_encrypt(&cf_aes, &aes,
+                 plain, sizeof plain,
+                 header, sizeof header,
+                 iv, sizeof iv,
+                 out,
+                 mac, sizeof mac);
+}
+
+static void aes128eax_test(void)
+{
+  /* AES-128-EAX over one 16-byte block of plaintext with 16 bytes of
+   * additional data and a 12-byte nonce; all inputs are zero. */
+  uint8_t key[16] = { 0 };
+  uint8_t plain[16] = { 0 };
+  uint8_t header[16] = { 0 };
+  uint8_t iv[12] = { 0 };
+  uint8_t out[16] = { 0 };
+  uint8_t mac[16] = { 0 };
+  cf_aes_context aes;
+
+  cf_aes_init(&aes, key, sizeof key);
+  cf_eax_encrypt(&cf_aes, &aes,
+                 plain, sizeof plain,
+                 header, sizeof header,
+                 iv, sizeof iv,
+                 out,
+                 mac, sizeof mac);
+}
+
+static void aes128ccm_test(void)
+{
+  /* AES-128-CCM over one 16-byte block of plaintext with 16 bytes of
+   * additional data, an 11-byte nonce and the third length argument
+   * set to 4; all inputs are zero. */
+  uint8_t key[16] = { 0 };
+  uint8_t plain[16] = { 0 };
+  uint8_t header[16] = { 0 };
+  uint8_t iv[11] = { 0 };
+  uint8_t out[16] = { 0 };
+  uint8_t mac[16] = { 0 };
+  cf_aes_context aes;
+
+  cf_aes_init(&aes, key, sizeof key);
+  cf_ccm_encrypt(&cf_aes, &aes,
+                 plain, sizeof plain, 4,
+                 header, sizeof header,
+                 iv, sizeof iv,
+                 out,
+                 mac, sizeof mac);
+}
+
+static void salsa20_test(void)
+{
+  /* Salsa20 with a 32-byte key and 8-byte nonce over one 64-byte
+   * message; all inputs are zero. */
+  uint8_t key[32] = { 0 };
+  uint8_t iv[8] = { 0 };
+  uint8_t plain[64] = { 0 };
+  uint8_t out[64] = { 0 };
+  cf_salsa20_ctx stream;
+
+  cf_salsa20_init(&stream, key, sizeof key, iv);
+  cf_salsa20_cipher(&stream, plain, out, sizeof plain);
+}
+
+static void chacha20_test(void)
+{
+  /* ChaCha20 with a 32-byte key and 8-byte nonce over one 64-byte
+   * message; all inputs are zero. */
+  uint8_t key[32] = { 0 };
+  uint8_t iv[8] = { 0 };
+  uint8_t plain[64] = { 0 };
+  uint8_t out[64] = { 0 };
+  cf_chacha20_ctx stream;
+
+  cf_chacha20_init(&stream, key, sizeof key, iv);
+  cf_chacha20_cipher(&stream, plain, out, sizeof plain);
+}
+
+static void curve25519_test(void)
+{
+  /* One base-point multiplication; the scalar is 0x01 followed by
+   * 31 zero bytes. */
+  uint8_t public_key[32];
+  uint8_t scalar[32] = { 1 };
+
+  cf_curve25519_mul_base(public_key, scalar);
+}
+
+static const uint8_t *mac_message = (const uint8_t *) "hello world";
+static const size_t mac_message_len = 11;
+
+static void poly1305_test(void)
+{
+  /* Poly1305-AES over mac_message: the first 16 key bytes key AES to
+   * encrypt the nonce, the last 16 are handed to Poly1305. */
+  uint8_t key[32] = { 0 };
+  uint8_t nonce[16] = { 0 };
+  uint8_t masked_nonce[16];
+  uint8_t mac[16];
+  cf_aes_context aes;
+  cf_poly1305 state;
+
+  cf_aes_init(&aes, key, 16);
+  cf_aes_encrypt(&aes, nonce, masked_nonce);
+
+  cf_poly1305_init(&state, key + 16, masked_nonce);
+  cf_poly1305_update(&state, mac_message, mac_message_len);
+  cf_poly1305_finish(&state, mac);
+}
+
+static void hmacsha256_test(void)
+{
+  /* HMAC-SHA256 of mac_message under a 32-byte all-zero key. */
+  uint8_t key[32] = { 0 };
+  uint8_t mac[32] = { 0 };
+  cf_hmac_ctx state;
+
+  cf_hmac_init(&state, &cf_sha256, key, sizeof key);
+  cf_hmac_update(&state, mac_message, mac_message_len);
+  cf_hmac_finish(&state, mac);
+}
+
+static void norx_test(void)
+{
+  /* NORX32 over 16 bytes of plaintext with a 16-byte header and no
+   * trailer; key, nonce and data are all zero. */
+  uint8_t key[16] = { 0 };
+  uint8_t iv[8] = { 0 };
+  uint8_t header[16] = { 0 };
+  uint8_t plain[16] = { 0 };
+  uint8_t out[16] = { 0 };
+  uint8_t mac[16] = { 0 };
+
+  cf_norx32_encrypt(key, iv,
+                    header, sizeof header,
+                    plain, sizeof plain,
+                    NULL, 0,
+                    out, mac);
+}
+
+/* Shared buffers for the aeadperf_* tests below.  The message and
+ * ciphertext buffers are sized for the largest bracket (BRACKET_END)
+ * when BRACKET_MODE is defined, and are a single byte otherwise. */
+#ifndef BRACKET_MODE
+# define AEADPERF_LEN 1
+#else
+# define AEADPERF_LEN BRACKET_END
+#endif
+
+static uint8_t aead_msg[AEADPERF_LEN] = { 0 };
+static uint8_t aead_cipher[AEADPERF_LEN] = { 0 };
+static uint8_t aead_aad[16] = { 0 };
+/* The AES-128 variants pass 16 as the key length, the AES-256 ones 32. */
+static uint8_t aead_key[32] = { 0 };
+/* Callers pass explicit nonce lengths (11 or 12) where the API takes one. */
+static uint8_t aead_nonce[16] = { 0 };
+static uint8_t aead_tag[16] = { 0 };
+
+static void aeadperf_norx(void)
+{
+  /* NORX32 over the first `bracket` bytes of the shared message
+   * buffer, with the full AAD buffer and no trailer. */
+  cf_norx32_encrypt(aead_key,
+                    aead_nonce,
+                    aead_aad, sizeof aead_aad,
+                    aead_msg, bracket,
+                    NULL, 0,
+                    aead_cipher,
+                    aead_tag);
+}
+
+static void aeadperf_chacha20poly1305(void)
+{
+  /* ChaCha20-Poly1305 over the first `bracket` bytes of the shared
+   * message buffer, with the full AAD buffer. */
+  cf_chacha20poly1305_encrypt(aead_key,
+                              aead_nonce,
+                              aead_aad, sizeof aead_aad,
+                              aead_msg, bracket,
+                              aead_cipher,
+                              aead_tag);
+}
+static void aeadperf_aes128gcm(void)
+{
+  /* AES-128-GCM (16-byte key, 12-byte nonce, 16-byte tag) over the
+   * first `bracket` bytes of the shared message buffer. */
+  cf_aes_context aes;
+
+  cf_aes_init(&aes, aead_key, 16);
+  cf_gcm_encrypt(&cf_aes, &aes,
+                 aead_msg, bracket,
+                 aead_aad, sizeof aead_aad,
+                 aead_nonce, 12,
+                 aead_cipher,
+                 aead_tag, 16);
+}
+
+static void aeadperf_aes128ccm(void)
+{
+  /* AES-128-CCM (16-byte key, length argument 4, 11-byte nonce,
+   * 16-byte tag) over the first `bracket` bytes of the shared buffer. */
+  cf_aes_context aes;
+
+  cf_aes_init(&aes, aead_key, 16);
+  cf_ccm_encrypt(&cf_aes, &aes,
+                 aead_msg, bracket, 4,
+                 aead_aad, sizeof aead_aad,
+                 aead_nonce, 11,
+                 aead_cipher,
+                 aead_tag, 16);
+}
+
+static void aeadperf_aes128eax(void)
+{
+  /* AES-128-EAX (16-byte key, 12-byte nonce, 16-byte tag) over the
+   * first `bracket` bytes of the shared message buffer. */
+  cf_aes_context aes;
+
+  cf_aes_init(&aes, aead_key, 16);
+  cf_eax_encrypt(&cf_aes, &aes,
+                 aead_msg, bracket,
+                 aead_aad, sizeof aead_aad,
+                 aead_nonce, 12,
+                 aead_cipher,
+                 aead_tag, 16);
+}
+
+static void aeadperf_aes256gcm(void)
+{
+  /* AES-256-GCM (32-byte key, 12-byte nonce, 16-byte tag) over the
+   * first `bracket` bytes of the shared message buffer. */
+  cf_aes_context aes;
+
+  cf_aes_init(&aes, aead_key, 32);
+  cf_gcm_encrypt(&cf_aes, &aes,
+                 aead_msg, bracket,
+                 aead_aad, sizeof aead_aad,
+                 aead_nonce, 12,
+                 aead_cipher,
+                 aead_tag, 16);
+}
+
+static void aeadperf_aes256ccm(void)
+{
+  /* AES-256-CCM (32-byte key, length argument 4, 11-byte nonce,
+   * 16-byte tag) over the first `bracket` bytes of the shared buffer. */
+  cf_aes_context aes;
+
+  cf_aes_init(&aes, aead_key, 32);
+  cf_ccm_encrypt(&cf_aes, &aes,
+                 aead_msg, bracket, 4,
+                 aead_aad, sizeof aead_aad,
+                 aead_nonce, 11,
+                 aead_cipher,
+                 aead_tag, 16);
+}
+
+static void aeadperf_aes256eax(void)
+{
+  /* AES-256-EAX (32-byte key, 12-byte nonce, 16-byte tag) over the
+   * first `bracket` bytes of the shared message buffer. */
+  cf_aes_context aes;
+
+  cf_aes_init(&aes, aead_key, 32);
+  cf_eax_encrypt(&cf_aes, &aes,
+                 aead_msg, bracket,
+                 aead_aad, sizeof aead_aad,
+                 aead_nonce, 12,
+                 aead_cipher,
+                 aead_tag, 16);
+}
+
+/* Provided by linkscript */
+extern uint32_t __HeapLimit;
+
+#define STACK_MAGIC 0x57ac34df
+
+/* Fills the memory between the linker symbol __HeapLimit and the
+ * address of a local of this function with STACK_MAGIC, so that
+ * measure_stack() can later see how much of it was overwritten.
+ * noinline gives the function its own frame, so `ss` marks the
+ * upper end of the region to fill. */
+static __attribute__((noinline)) void clear_stack(void)
+{
+ uint32_t *stack_start = &__HeapLimit;
+ /* Only the address of ss is used: it is where the fill stops. */
+ uint32_t ss = 0, *stack_stop = &ss;
+ size_t words = stack_stop - stack_start;
+ for (size_t i = 0; i < words; i++)
+ stack_start[i] = STACK_MAGIC;
+}
+
+/* Scans upward from __HeapLimit for the first word that no longer
+ * holds STACK_MAGIC and returns the number of words from there up to
+ * this function's own local, plus 4.  Returns 0 if every word still
+ * holds the magic value.  The result is in words, not bytes. */
+static __attribute__((noinline)) uint32_t measure_stack(void)
+{
+ uint32_t *stack_start = &__HeapLimit;
+ /* ss is never read; only its address is used, as the scan limit. */
+ uint32_t ss, *stack_stop = &ss;
+ size_t words = stack_stop - stack_start;
+ for (size_t i = 0; i < words; i++)
+ if (stack_start[i] != STACK_MAGIC)
+ return words - i + 4; /* we used 4 words for ourselves, roughly */
+
+ return 0;
+}
+
+/* Runs fn once and reports its cost on the debug output as two lines,
+ * "cycles = 0x..." and "stack = 0x...".  The stack is painted with
+ * STACK_MAGIC first; the stack figure is the word count returned by
+ * measure_stack() converted to bytes. */
+static void measure(measure_fn fn)
+{
+ clear_stack();
+ uint32_t start_cycles = reset_cycles();
+ fn();
+ uint32_t end_cycles = get_cycles();
+ /* Taken after the cycle reading so the scan is not part of the timing. */
+ uint32_t stack_words = measure_stack();
+
+ emit("cycles = ");
+ emit_uint32(end_cycles - start_cycles);
+ emit("\n");
+ emit("stack = ");
+ /* words -> bytes */
+ emit_uint32(stack_words << 2);
+ emit("\n");
+}
+
+#define STRING_(x) #x
+#define STRING(x) STRING_(x)
+
+int main(void)
+{
+  /* Announce which test was compiled in, then measure it: once in the
+   * normal build, or once per bracket value in BRACKET_MODE. */
+  emit(STRING(TEST) "\n");
+
+#ifdef BRACKET_MODE
+  bracket = BRACKET_START;
+  while (bracket <= BRACKET_END)
+  {
+    emit("bracket = ");
+    emit_uint32(bracket);
+    emit("\n");
+    measure(TEST);
+    bracket += BRACKET_STEP;
+  }
+#else
+  measure(TEST);
+#endif
+
+  quit_success();
+
+  /* Never reached (quit_success does not return).  These references
+   * only stop the compiler from warning about the tests that TEST
+   * did not select. */
+  (void) bracket;
+  (void) do_nothing;
+  (void) stack_8w;
+  (void) stack_64w;
+
+  (void) hashtest_sha256;
+  (void) hashtest_sha512;
+  (void) hashtest_sha3_256;
+  (void) hashtest_sha3_512;
+
+  (void) aes128block_test;
+  (void) aes128sched_test;
+  (void) aes256block_test;
+  (void) aes256sched_test;
+  (void) aes128gcm_test;
+  (void) aes128eax_test;
+  (void) aes128ccm_test;
+
+  (void) salsa20_test;
+  (void) chacha20_test;
+  (void) curve25519_test;
+  (void) poly1305_test;
+  (void) hmacsha256_test;
+  (void) norx_test;
+
+  (void) aeadperf_norx;
+  (void) aeadperf_chacha20poly1305;
+  (void) aeadperf_aes128gcm;
+  (void) aeadperf_aes128ccm;
+  (void) aeadperf_aes128eax;
+  (void) aeadperf_aes256gcm;
+  (void) aeadperf_aes256ccm;
+  (void) aeadperf_aes256eax;
+}
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/memcpy.s b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/memcpy.s
new file mode 100644
index 00000000..63406fe5
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/memcpy.s
@@ -0,0 +1,49 @@
+ .text
+ .syntax unified
+ .global memcpy
+ .func memcpy
+ .thumb_func
+
+memcpy:
+ /* on entry
+ * r0 = targ
+ * r1 = src
+ * r2 = len (bytes)
+ * on exit
+ * r0 = targ (unchanged)
+ */
+ /* r0 is saved so the final pop can restore the original targ as
+ * the return value; r0 itself is advanced while copying. */
+ push {r0, r4, lr}
+
+ /* If targ or src are unaligned, drop to byte
+ * processing. */
+ /* r3 = (targ | src) & 3: non-zero when either pointer is not a
+ * multiple of 4. */
+ mov r3, r0
+ movs r4, #3
+ orrs r3, r1
+ ands r3, r4
+ bne L_bytewise
+
+ /* Process words */
+ /* Copies 4 bytes per iteration while at least 4 remain, then
+ * falls into the byte loop for the last 0-3 bytes. */
+L_wordwise:
+ cmp r2, #4
+ blo L_bytewise
+ ldr r4, [r1]
+ adds r1, #4
+ str r4, [r0]
+ adds r0, #4
+ subs r2, #4
+ b L_wordwise
+
+ /* Process bytes */
+ /* Copies one byte per iteration until len reaches zero. */
+L_bytewise:
+ cmp r2, #0
+ beq L_fin
+ ldrb r4, [r1]
+ adds r1, #1
+ strb r4, [r0]
+ adds r0, #1
+ subs r2, #1
+ b L_bytewise
+
+ /* Restore the saved targ into r0, restore r4, and return by
+ * popping the saved lr into pc. */
+L_fin:
+ pop {r0, r4, pc}
+ .endfunc
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/memset.s b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/memset.s
new file mode 100644
index 00000000..a5019667
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/memset.s
@@ -0,0 +1,50 @@
+ .text
+ .syntax unified
+ .global memset
+ .func memset
+ .thumb_func
+
+memset:
+ /* on entry
+ * r0 = targ
+ * r1 = value
+ * r2 = len (bytes)
+ * on exit
+ * r0 = targ (unchanged)
+ */
+ /* r0 is saved so the final pop can restore the original targ as
+ * the return value; r0 itself is advanced while storing. */
+ push {r0, r4, lr}
+
+ /* If targ is unaligned, drop to byte
+ * processing. */
+ /* r3 = targ & 3: non-zero when targ is not a multiple of 4. */
+ movs r3, #3
+ ands r3, r0
+ bne L_bytewise
+
+ /* Process words */
+ /* Build r4 by repeating r1. */
+ /* The low byte of r1 is zero-extended, then OR-ed with itself
+ * shifted by 8 and the result shifted by 16, so all four bytes
+ * of r4 hold the fill value. */
+ uxtb r4, r1
+ lsls r3, r4, #8
+ orrs r4, r3
+ lsls r3, r4, #16
+ orrs r4, r3
+
+ /* Stores one word per iteration while at least 4 bytes remain,
+ * then falls into the byte loop for the last 0-3 bytes. */
+L_wordwise:
+ cmp r2, #4
+ blo L_bytewise
+ str r4, [r0]
+ adds r0, #4
+ subs r2, #4
+ b L_wordwise
+
+ /* Process bytes */
+ /* strb stores only the low byte of r1. */
+L_bytewise:
+ cmp r2, #0
+ beq L_fin
+ strb r1, [r0]
+ adds r0, #1
+ subs r2, #1
+ b L_bytewise
+
+ /* Restore the saved targ into r0, restore r4, and return by
+ * popping the saved lr into pc. */
+L_fin:
+ pop {r0, r4, pc}
+ .endfunc
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/merge.py b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/merge.py
new file mode 100644
index 00000000..71d50895
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/merge.py
@@ -0,0 +1,26 @@
+import sys
+
+def extract_results(results):
+ index = 0
+ while index < len(results):
+ if results[index].startswith('## '):
+ end = results.index('\n', index)
+ yield results[index:end]
+ index += 1
+
+def merge(readme, res):
+ title, table = res[0], res[1:]
+ assert title in readme, 'Section ' + title + ' missing from README.md'
+ secindex = readme.index(title)
+ hdrindex = [i for i in range(secindex, len(readme)) if readme[i].startswith('---------- | ')][0]
+ start = hdrindex - 1
+ end = readme.index('\n', start)
+ table = [t.rstrip() + '\n' for t in table]
+ return readme[:start] + table + readme[end:]
+
+results = sys.stdin.readlines()
+readme = open('../../README.md').readlines()
+
+for res in extract_results(results):
+ readme = merge(readme, res)
+print ''.join(readme).rstrip()
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.efm32.cfg b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.efm32.cfg
new file mode 100644
index 00000000..85af4733
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.efm32.cfg
@@ -0,0 +1,3 @@
+# OpenOCD setup for the efm32 build: jlink interface config, SWD
+# transport, efm32 target config.
+source [find interface/jlink.cfg]
+transport select swd
+source [find target/efm32.cfg]
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.stm32f0.cfg b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.stm32f0.cfg
new file mode 100644
index 00000000..e9356f75
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.stm32f0.cfg
@@ -0,0 +1,3 @@
+# OpenOCD setup for the stm32f0 build: stlink-v2 interface config,
+# hla_swd transport, stm32f0x target config.
+source [find interface/stlink-v2.cfg]
+transport select hla_swd
+source [find target/stm32f0x.cfg]
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.stm32f1.cfg b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.stm32f1.cfg
new file mode 100644
index 00000000..1108ea07
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.stm32f1.cfg
@@ -0,0 +1,3 @@
+# OpenOCD setup for the stm32f1 build: stlink-v2 interface config,
+# hla_swd transport, stm32f1x target config.
+source [find interface/stlink-v2.cfg]
+transport select hla_swd
+source [find target/stm32f1x.cfg]
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.stm32f3.cfg b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.stm32f3.cfg
new file mode 100644
index 00000000..de023b84
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.stm32f3.cfg
@@ -0,0 +1,3 @@
+# OpenOCD setup for the stm32f3 build: stlink-v2 interface config,
+# hla_swd transport, stm32f3x target config.
+source [find interface/stlink-v2.cfg]
+transport select hla_swd
+source [find target/stm32f3x.cfg]
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/report.py b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/report.py
new file mode 100644
index 00000000..718ab24e
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/report.py
@@ -0,0 +1,276 @@
+"""
+Interprets logs from test runs. Outputs ASCII
+tables containing results, json data, etc.
+"""
+
+import json
+import sys
+
+# Targets to report on, in table row order.  Each name is also the
+# last component of the log file name read by extract().
+archs = 'stm32f0 stm32f1 stm32f3'.split()
+# Names of the tests whose logs are looked for, one per line.
+tests = """
+aes128block_test
+aes256block_test
+aes128sched_test
+aes256sched_test
+hashtest_sha256
+hashtest_sha512
+hashtest_sha3_256
+hashtest_sha3_512
+aes128gcm_test
+aes128eax_test
+aes128ccm_test
+norx_test
+salsa20_test
+chacha20_test
+poly1305_test
+hmacsha256_test
+curve25519_test
+aeadperf_norx
+aeadperf_aes128gcm
+aeadperf_aes128eax
+aeadperf_aes128ccm
+aeadperf_aes256gcm
+aeadperf_aes256eax
+aeadperf_aes256ccm
+aeadperf_chacha20poly1305
+do_nothing
+""".split()
+
+# Display name printed in the 'Core' column for each target.
+arch_names = dict(
+ stm32f0 = 'Cortex-M0',
+ stm32f1 = 'Cortex-M3',
+ stm32f3 = 'Cortex-M4F'
+ )
+
+# Test whose code size is subtracted from every other test's code size.
+base_test = 'do_nothing'
+
+def extract(arch, test):
+ fn = 'run.%s.%s.log' % (test, arch)
+
+ code_size = 0
+ data_size = 0
+ cycle_count = None
+ stack_usage = None
+ brackets = None
+ current_bracket = None
+
+ try:
+ lines = open(fn).readlines()
+ except IOError:
+ return None
+
+ for l in lines:
+ if 'LOAD' in l:
+ parts = l.split()
+ assert len(parts) >= 8
+ assert 'LOAD' == parts[0]
+ if parts[6] == 'RWE':
+ code_size += long(parts[5], 16)
+ if parts[6] == 'RW':
+ data_size += long(parts[5], 16)
+
+ if l.startswith('bracket = '):
+ bracket = long(l.split(' = ')[1].strip(), 16)
+ current_bracket = bracket
+ if brackets is None:
+ brackets = {}
+ brackets[current_bracket] = dict()
+
+ if l.startswith('cycles = '):
+ cycle_count = long(l.split(' = ')[1].strip(), 16)
+ if current_bracket is not None:
+ brackets[current_bracket]['cycle_count'] = cycle_count
+
+ if l.startswith('stack = '):
+ stack_usage = long(l.split(' = ')[1].strip(), 16)
+ if current_bracket is not None:
+ brackets[current_bracket]['stack_usage'] = stack_usage
+
+ return dict(
+ code_size = code_size,
+ data_size = data_size,
+ cycle_count = cycle_count,
+ stack_usage = stack_usage,
+ brackets = brackets
+ )
+
+def print_table(rows):
+ header, rows = rows[0], rows[1:]
+ assert not [True for r in rows if len(r) != len(header)]
+ widths = []
+ for i, h in enumerate(header):
+ widths.append(max([len(h)] + [len(r[i]) for r in rows]))
+
+ def print_row(row):
+ print ' | '.join(c + (' ' * (widths[i] - len(c))) for i, c in enumerate(row))
+
+ print_row(header)
+ print_row(['-' * w for w in widths])
+ for r in rows:
+ print_row(r)
+
+results = {}
+
+for arch in archs:
+ for test in tests:
+ inf = extract(arch, test)
+ if inf:
+ results.setdefault(arch, {})[test] = inf
+
+for arch in results.keys():
+ if base_test not in results[arch]:
+ print 'need', base_test, 'results to report for', arch
+ continue
+
+ base_result = results[arch][base_test]
+
+ for test in results[arch].keys():
+ if test == base_test:
+ continue
+
+ results[arch][test]['code_size'] -= base_result['code_size']
+
+def tabulate_aes(arch, block_result, sched_result, table = None):
+ if table is None:
+ table = []
+ table.append((
+ 'Core',
+ 'Cycles (key schedule + block)',
+ 'Cycles (key schedule)',
+ 'Cycles (block)',
+ 'Stack',
+ 'Code size'
+ ))
+
+ table.append(
+ (
+ arch_names[arch],
+ '%d' % block_result['cycle_count'],
+ '%d' % sched_result['cycle_count'],
+ '%d' % (block_result['cycle_count'] - sched_result['cycle_count']),
+ '%dB' % block_result['stack_usage'],
+ '%dB' % block_result['code_size']
+ ))
+
+ return table
+
+def print_std(result):
+ print """* **Cycles**: %(cycle_count)d
+* **Stack**: %(stack_usage)dB
+* **Code size**: %(code_size)dB
+""" % result
+
+def tabulate_std(arch, result, table = None):
+ if table is None:
+ table = []
+ table.append(('Core', 'Cycles', 'Stack', 'Code size'))
+
+ table.append(
+ (
+ arch_names[arch],
+ '%d' % result['cycle_count'],
+ '%dB' % result['stack_usage'],
+ '%dB' % result['code_size']
+ ))
+
+ return table
+
+def tabulate(mktab):
+ table = None
+ for arch in archs:
+ if arch not in results:
+ continue
+ table = mktab(arch, table)
+ print_table(table)
+
+def convert_brackets(metric, tests):
+ for arch in archs:
+ arch_result = {}
+
+ # collect results for each test
+ for t in tests:
+ if arch not in results or t not in results[arch]:
+ print 'missing', arch, t
+ continue
+ data = results[arch][t]['brackets']
+ arch_result[t] = [[b, data[b][metric]] for b in sorted(data.keys())]
+
+ # convert into list of [bracket, test-1, test-2, ...] lists
+ out = []
+ if len(arch_result) == 0:
+ continue
+ first_row = arch_result.values()[0]
+
+ for i in range(len(first_row)):
+ row = [ first_row[i][0] ]
+
+ for k in sorted(arch_result.keys()):
+ if len(arch_result[k]) != len(first_row):
+ print 'warn:', 'test', k, 'did not complete?'
+ rr = arch_result[k][i]
+ row.append(rr[1])
+
+ out.append(row)
+
+ print json.dumps(out)
+
+convert_brackets('cycle_count',
+ [
+ 'aeadperf_norx',
+ 'aeadperf_aes128gcm',
+ 'aeadperf_aes128eax',
+ 'aeadperf_aes128ccm',
+ 'aeadperf_aes256gcm',
+ 'aeadperf_aes256eax',
+ 'aeadperf_aes256ccm',
+ 'aeadperf_chacha20poly1305'
+ ])
+convert_brackets('stack_usage',
+ [
+ 'aeadperf_norx',
+ 'aeadperf_aes128gcm',
+ 'aeadperf_aes128eax',
+ 'aeadperf_aes128ccm',
+ 'aeadperf_aes256gcm',
+ 'aeadperf_aes256eax',
+ 'aeadperf_aes256ccm',
+ 'aeadperf_chacha20poly1305'
+ ])
+
+# screwed if we need other block ciphers
+print '###', '128-bit key'
+tabulate(lambda arch, table: tabulate_aes(arch, results[arch]['aes128block_test'], results[arch]['aes128sched_test'], table))
+print
+
+print '###', '256-bit key'
+tabulate(lambda arch, table: tabulate_aes(arch, results[arch]['aes256block_test'], results[arch]['aes256sched_test'], table))
+print
+
+def do_table(title, test):
+ print '##', title
+ tabulate(lambda arch, table: tabulate_std(arch, results[arch][test], table))
+ print
+
+do_table('AES128-GCM', 'aes128gcm_test')
+do_table('AES128-EAX', 'aes128eax_test')
+do_table('AES128-CCM', 'aes128ccm_test')
+do_table('NORX32', 'norx_test')
+do_table('ChaCha20', 'chacha20_test')
+do_table('Salsa20', 'salsa20_test')
+do_table('SHA256', 'hashtest_sha256')
+do_table('SHA512', 'hashtest_sha512')
+do_table('SHA3-256', 'hashtest_sha3_256')
+do_table('SHA3-512', 'hashtest_sha3_512')
+do_table('HMAC-SHA256', 'hmacsha256_test')
+do_table('Poly1305-AES', 'poly1305_test')
+do_table('Curve25519', 'curve25519_test')
+
+if '--aead' in sys.argv:
+ do_table('AEAD-Shootout: NORX', 'aeadperf_norx')
+ do_table('AEAD-Shootout: AES-128-GCM', 'aeadperf_aes128gcm')
+ do_table('AEAD-Shootout: AES-128-EAX', 'aeadperf_aes128eax')
+ do_table('AEAD-Shootout: AES-128-CCM', 'aeadperf_aes128ccm')
+ do_table('AEAD-Shootout: AES-256-GCM', 'aeadperf_aes256gcm')
+ do_table('AEAD-Shootout: AES-256-EAX', 'aeadperf_aes256eax')
+ do_table('AEAD-Shootout: AES-256-CCM', 'aeadperf_aes256ccm')
+ do_table('AEAD-Shootout: ChaCha20-Poly1305', 'aeadperf_chacha20poly1305')
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/semihost.c b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/semihost.c
new file mode 100644
index 00000000..cbe5aa2e
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/semihost.c
@@ -0,0 +1,170 @@
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+
+#include "semihost.h"
+
+#define OP_WRITE0 0x04
+#define OP_EXIT 0x18
+#define OP_EXIT_ARG_FAILURE 0x0
+#define OP_EXIT_ARG_SUCCESS 0x20026
+
+extern uint32_t semihost(uint32_t, volatile void *);
+
+__attribute__((noreturn))
+void quit_success(void)
+{
+  /* Request exit with the success argument; spin forever if the
+   * request returns. */
+  semihost(OP_EXIT, (void *) OP_EXIT_ARG_SUCCESS);
+  for (;;)
+  {
+  }
+}
+
+__attribute__((noreturn))
+void quit_failure(void)
+{
+  /* Request exit with the failure argument; spin forever if the
+   * request returns. */
+  semihost(OP_EXIT, (void *) OP_EXIT_ARG_FAILURE);
+  for (;;)
+  {
+  }
+}
+
+void emit(const char *buf)
+{
+  /* Hand the zero-terminated string to the host via OP_WRITE0. */
+  volatile void *text = (volatile void *) buf;
+  semihost(OP_WRITE0, text);
+}
+
+/* Emits the bytes from start up to and including end.  The text is
+ * copied into a zero-terminated scratch buffer in pieces of at most
+ * 32 bytes, because emit() needs a terminator.  Emits nothing when
+ * end == start - 1. */
+static void emit_extent(const char *start, const char *end)
+{
+  enum { CHUNK = 32 };
+  char chunk[CHUNK + 1];
+  size_t remaining = end - start + 1;
+
+  chunk[CHUNK] = 0;
+
+  /* full-size pieces */
+  for (; remaining >= CHUNK; remaining -= CHUNK, start += CHUNK)
+  {
+    memcpy(chunk, start, CHUNK);
+    emit(chunk);
+  }
+
+  /* whatever is left over */
+  if (remaining != 0)
+  {
+    memcpy(chunk, start, remaining);
+    chunk[remaining] = 0;
+    emit(chunk);
+  }
+}
+
+/* Minimal printf-style output.  Literal text is passed through, and
+ * these conversions are understood:
+ *   %u  uint32_t argument, written by emit_uint32
+ *   %s  const char * argument, written by emit
+ *   %%  a literal '%'
+ * Any other character after '%' is dropped together with the '%'.
+ * A lone '%' at the very end of the format is ignored. */
+void emitf(const char *fmt, ...)
+{
+  /* [lit, fmt) is the run of literal text seen but not yet written. */
+  const char *lit = fmt;
+
+  va_list args;
+  va_start(args, fmt);
+
+  while (*fmt)
+  {
+    if (*fmt != '%')
+    {
+      fmt++;
+      continue;
+    }
+
+    /* Flush pending literal text.  emit_extent takes an inclusive end
+     * pointer, so an empty run must be skipped here rather than
+     * passed on (it would otherwise write the '%' itself). */
+    if (fmt > lit)
+      emit_extent(lit, fmt - 1);
+
+    if (fmt[1] == 0)
+    {
+      /* Lone '%' before the terminator: nothing to convert, and
+       * nothing past the terminator may be read. */
+      fmt++;
+      lit = fmt;
+      break;
+    }
+
+    switch (fmt[1])
+    {
+      case '%':
+        emit("%");
+        break;
+
+      case 'u':
+        emit_uint32(va_arg(args, uint32_t));
+        break;
+
+      case 's':
+        emit(va_arg(args, const char *));
+        break;
+    }
+
+    /* Step over both the '%' and its conversion character, so the
+     * second character is never re-examined as format text. */
+    fmt += 2;
+    lit = fmt;
+  }
+
+  va_end(args);
+
+  if (fmt > lit)
+    emit_extent(lit, fmt - 1);
+}
+
+static const char *hex_chars = "0123456789abcdef";
+
+void emit_hex(const void *ptr, size_t len)
+{
+  /* Each input byte becomes two lowercase hex digits, high nibble
+   * first, written with its own emit() call. */
+  const uint8_t *cursor = ptr;
+  char pair[3] = { 0, 0, 0 };
+
+  for (; len != 0; len--, cursor++)
+  {
+    uint8_t value = *cursor;
+    pair[0] = hex_chars[(value >> 4) & 0xf];
+    pair[1] = hex_chars[value & 0xf];
+    emit(pair);
+  }
+}
+
+void emit_uint32(uint32_t x)
+{
+  /* "0x" followed by exactly eight lowercase hex digits, most
+   * significant nibble first. */
+  char text[sizeof "0x11223344"];
+  char *out = text;
+
+  *out++ = '0';
+  *out++ = 'x';
+  for (int shift = 28; shift >= 0; shift -= 4)
+    *out++ = hex_chars[(x >> shift) & 0xf];
+  *out = 0;
+
+  emit(text);
+}
+
+/* Layout of three consecutive 32-bit registers, accessed through the
+ * SysTick pointer at 0xe000e010 below. */
+typedef struct
+{
+ volatile uint32_t ctrl;
+ volatile uint32_t reload;
+ volatile uint32_t current;
+} systick;
+
+#define SysTick ((systick *)0xe000e010)
+
+/* Bits written to the ctrl register by reset_cycles(). */
+#define STCTRL_SYSCLOCK 0x04
+#define STCTRL_TICKINT 0x02
+#define STCTRL_ENABLE 0x01
+
+/* The reload value is the 24-bit maximum; get_cycles() shifts the
+ * tick count by the same 24 bits. */
+#define STCTRL_MAX 0xffffff
+#define STCTRL_SHIFT 24
+
+extern uint32_t get_ticks(void);
+extern void reset_ticks(void);
+
+/* Restarts cycle counting: sets the reload value to STCTRL_MAX,
+ * writes the enable/interrupt/clock-source bits, writes 0 to the
+ * current-value register and calls reset_ticks().
+ * Returns get_ticks() as read immediately afterwards; note that this
+ * is the tick count only, not a get_cycles()-style combined value. */
+uint32_t reset_cycles(void)
+{
+ SysTick->reload = STCTRL_MAX;
+ SysTick->ctrl = STCTRL_SYSCLOCK | STCTRL_TICKINT | STCTRL_ENABLE;
+ SysTick->current = 0;
+ reset_ticks();
+ return get_ticks();
+}
+
+/* Returns the tick count in the bits above bit 23, plus the distance
+ * of the current-value register from STCTRL_MAX in the low 24 bits.
+ * NOTE(review): get_ticks() and SysTick->current are read separately,
+ * so a counter wrap between the two reads would presumably skew the
+ * result by one tick -- confirm against the tick handler. */
+uint32_t get_cycles(void)
+{
+ return (get_ticks() << STCTRL_SHIFT) + (STCTRL_MAX - SysTick->current);
+}
+
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/semihost.h b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/semihost.h
new file mode 100644
index 00000000..cf6f01a5
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/semihost.h
@@ -0,0 +1,40 @@
+#ifndef SEMIHOST_H
+#define SEMIHOST_H
+
+#include <stdint.h>
+#include <stdlib.h>
+
+/* Exits emulator with success (or merely hangs). */
+__attribute__((noreturn))
+void quit_success(void);
+
+/* Exits emulator with failure (or merely hangs). */
+__attribute__((noreturn))
+void quit_failure(void);
+
+/* Writes zero terminated string to debug output */
+void emit(const char *buf);
+
+/* Writes a formatting string to debug output.
+ *
+ * Supported:
+ * %u - uint32_t argument, same as emit_uint32
+ * %s - const char * argument, same as emit
+ */
+void emitf(const char *fmt, ...);
+
+/* Writes hex dump of len bytes at ptr to debug output. */
+void emit_hex(const void *ptr, size_t len);
+
+/* Writes value v in hex to debug output, in format:
+ * 0xHHHHHHHH (equivalent to printf 0x%08x). */
+void emit_uint32(uint32_t v);
+
+/* Reset cycle counter to 0. Returns the current value
+ * (just after resetting it). */
+uint32_t reset_cycles(void);
+
+/* Return the value of the cycle counter. */
+uint32_t get_cycles(void);
+
+#endif
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/semihost.s b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/semihost.s
new file mode 100644
index 00000000..0fddf045
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/semihost.s
@@ -0,0 +1,15 @@
+ .text
+ .syntax unified
+ .global semihost
+ .func semihost
+ .thumb_func
+
+/* Issues one request to the debug host.  The C callers declare this
+ * as uint32_t semihost(uint32_t op, volatile void *arg). */
+semihost:
+ /* on entry
+ * r0 = op
+ * r1 = arg */
+ push {r7, lr}
+ /* Breakpoint with immediate 0xab; r0 and r1 still hold op and arg.
+ * NOTE(review): presumably the standard Thumb semihosting trap,
+ * with the host's result left in r0 -- confirm. */
+ bkpt 0xab
+ /* Restore r7 and return by popping the saved lr into pc. */
+ pop {r7, pc}
+
+ .endfunc
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/cortex_m0_mpy121666.s b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/cortex_m0_mpy121666.s
new file mode 100644
index 00000000..49e3b5d0
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/cortex_m0_mpy121666.s
@@ -0,0 +1,199 @@
+// Implementation of multiplication of an fe25519 bit value with the curve constant 121666.
+//
+// B. Haase, Endress + Hauser Conducta GmbH & Ko. KG
+// public domain.
+//
+// gnu assembler format.
+//
+// Generated and tested with C++ functions in the test subdirectory.
+//
+// ATTENTION:
+// Not yet tested on target hardware.
+
+
+ .cpu cortex-m0
+ .fpu softvfp
+ .eabi_attribute 20, 1
+ .eabi_attribute 21, 1
+ .eabi_attribute 23, 3
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+ .eabi_attribute 26, 1
+ .eabi_attribute 30, 2
+ .eabi_attribute 34, 0
+ .eabi_attribute 18, 4
+ .code 16
+
+ // Name this source file, not its sibling cortex_m0_reduce25519.s.
+ .file "cortex_m0_mpy121666.s"
+
+ .text
+ .align 2
+
+ .global fe25519_mpyWith121666_asm
+ .code 16
+ .thumb_func
+ .type fe25519_mpyWith121666_asm, %function
+
+// fe25519_mpyWith121666_asm
+//   r0 = pointer to the 8-word result (written at offsets 0..28)
+//   r1 = pointer to the 8-word input  (read at offsets 0..28)
+// Words are processed with carries running from offset 0 upward.
+//
+// The constant is split as 121666 = 65536 + 56130, so each 32-bit word x
+// contributes (x << 16) + x * 56130.  The product x * 56130 is itself
+// formed from the two 16-bit halves of x, each multiplied by r7 = 56130.
+fe25519_mpyWith121666_asm:
+ push {r4,r5,r6,r7,r14}
+ ldr r7,__label_for_immediate_56130
+ // Word at offset 28 first: r6:r5 = in[28] * 121666.
+ ldr r2,[r1,#28]
+ lsl r5,r2,#16
+ lsr r6,r2,#16
+ lsr r3,r2,#16
+ uxth r2,r2
+ mul r2,r7
+ mul r3,r7
+ add r5,r2
+ mov r2,#0
+ adc r6,r2
+ lsl r2,r3,#16
+ lsr r3,r3,#16
+ add r5,r2
+ adc r6,r3
+ // Keep the low 31 bits as a provisional out[28] ...
+ lsl r2,r5,#1
+ lsr r2,r2,#1
+ str r2,[r0,#28]
+ // ... and fold everything from bit 31 upward back in as 19 * (those bits),
+ // which becomes the initial carry into the word at offset 0.
+ lsr r5,r5,#31
+ lsl r6,r6,#1
+ orr r5,r6
+ mov r6,#19
+ mul r5,r6
+ mov r6,#0
+ // Offset 0: accumulator r5, carry-out r6.
+ ldr r2,[r1,#0]
+ lsl r3,r2,#16
+ lsr r4,r2,#16
+ add r5,r3
+ adc r6,r4
+ lsr r3,r2,#16
+ uxth r2,r2
+ mul r2,r7
+ mul r3,r7
+ add r5,r2
+ mov r2,#0
+ adc r6,r2
+ lsl r2,r3,#16
+ lsr r3,r3,#16
+ add r5,r2
+ adc r6,r3
+ str r5,[r0,#0]
+ mov r5,#0
+ // Offset 4: accumulator r6, carry-out r5 (the two registers swap roles
+ // on every word).
+ ldr r2,[r1,#4]
+ lsl r3,r2,#16
+ lsr r4,r2,#16
+ add r6,r3
+ adc r5,r4
+ lsr r3,r2,#16
+ uxth r2,r2
+ mul r2,r7
+ mul r3,r7
+ add r6,r2
+ mov r2,#0
+ adc r5,r2
+ lsl r2,r3,#16
+ lsr r3,r3,#16
+ add r6,r2
+ adc r5,r3
+ str r6,[r0,#4]
+ mov r6,#0
+ // Offset 8: accumulator r5, carry-out r6.
+ ldr r2,[r1,#8]
+ lsl r3,r2,#16
+ lsr r4,r2,#16
+ add r5,r3
+ adc r6,r4
+ lsr r3,r2,#16
+ uxth r2,r2
+ mul r2,r7
+ mul r3,r7
+ add r5,r2
+ mov r2,#0
+ adc r6,r2
+ lsl r2,r3,#16
+ lsr r3,r3,#16
+ add r5,r2
+ adc r6,r3
+ str r5,[r0,#8]
+ mov r5,#0
+ // Offset 12: accumulator r6, carry-out r5.
+ ldr r2,[r1,#12]
+ lsl r3,r2,#16
+ lsr r4,r2,#16
+ add r6,r3
+ adc r5,r4
+ lsr r3,r2,#16
+ uxth r2,r2
+ mul r2,r7
+ mul r3,r7
+ add r6,r2
+ mov r2,#0
+ adc r5,r2
+ lsl r2,r3,#16
+ lsr r3,r3,#16
+ add r6,r2
+ adc r5,r3
+ str r6,[r0,#12]
+ mov r6,#0
+ // Offset 16: accumulator r5, carry-out r6.
+ ldr r2,[r1,#16]
+ lsl r3,r2,#16
+ lsr r4,r2,#16
+ add r5,r3
+ adc r6,r4
+ lsr r3,r2,#16
+ uxth r2,r2
+ mul r2,r7
+ mul r3,r7
+ add r5,r2
+ mov r2,#0
+ adc r6,r2
+ lsl r2,r3,#16
+ lsr r3,r3,#16
+ add r5,r2
+ adc r6,r3
+ str r5,[r0,#16]
+ mov r5,#0
+ // Offset 20: accumulator r6, carry-out r5.
+ ldr r2,[r1,#20]
+ lsl r3,r2,#16
+ lsr r4,r2,#16
+ add r6,r3
+ adc r5,r4
+ lsr r3,r2,#16
+ uxth r2,r2
+ mul r2,r7
+ mul r3,r7
+ add r6,r2
+ mov r2,#0
+ adc r5,r2
+ lsl r2,r3,#16
+ lsr r3,r3,#16
+ add r6,r2
+ adc r5,r3
+ str r6,[r0,#20]
+ mov r6,#0
+ // Offset 24: accumulator r5, carry-out r6.
+ ldr r2,[r1,#24]
+ lsl r3,r2,#16
+ lsr r4,r2,#16
+ add r5,r3
+ adc r6,r4
+ lsr r3,r2,#16
+ uxth r2,r2
+ mul r2,r7
+ mul r3,r7
+ add r5,r2
+ mov r2,#0
+ adc r6,r2
+ lsl r2,r3,#16
+ lsr r3,r3,#16
+ add r5,r2
+ adc r6,r3
+ str r5,[r0,#24]
+ mov r5,#0
+ // Add the final carry to the provisional out[28] stored at the start.
+ ldr r2,[r0,#28]
+ add r6,r2
+ str r6,[r0,#28]
+ pop {r4,r5,r6,r7,r15}
+
+ // Literal loaded into r7 above.
+ .align 2
+__label_for_immediate_56130:
+ .word 56130
+
+ .size fe25519_mpyWith121666_asm, .-fe25519_mpyWith121666_asm
+
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/cortex_m0_reduce25519.s b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/cortex_m0_reduce25519.s
new file mode 100644
index 00000000..4c09f5ea
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/cortex_m0_reduce25519.s
@@ -0,0 +1,176 @@
+// Implementation of a partial reduction modulo 2^255 - 19 (the upper 256 bits are folded in multiplied by 38, and bits from 255 upward multiplied by 19).
+//
+// B. Haase, Endress + Hauser Conducta GmbH & Ko. KG
+// public domain.
+//
+// gnu assembler format.
+//
+// Generated and tested with C++ functions in the test subdirectory and on the target.
+//
+
+ .cpu cortex-m0
+ .fpu softvfp
+ .eabi_attribute 20, 1
+ .eabi_attribute 21, 1
+ .eabi_attribute 23, 3
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+ .eabi_attribute 26, 1
+ .eabi_attribute 30, 2
+ .eabi_attribute 34, 0
+ .eabi_attribute 18, 4
+ .code 16
+
+ .file "cortex_m0_reduce25519.s"
+
+ .text
+ .align 2
+
+ .global fe25519_reduceTo256Bits_asm
+ .code 16
+ .thumb_func
+ .type fe25519_reduceTo256Bits_asm, %function
+
+// fe25519_reduceTo256Bits_asm
+//   r0 = pointer to the 8-word result (written at offsets 0..28)
+//   r1 = pointer to the 16-word input (read at offsets 0..60)
+// Words are processed with carries running from offset 0 upward.
+//
+// Each upper input word in[i+8] is multiplied by r7 = 38 and added to the
+// lower word in[i].  The 32x32 product is built from the two 16-bit halves
+// of the upper word.  r6 stays zero throughout and is only used to add the
+// carry flag with adc.
+fe25519_reduceTo256Bits_asm:
+ push {r4,r5,r6,r7,r14}
+ // Top pair first: r5:r4 = in[28] + 38 * in[60].
+ ldr r2,[r1,#60]
+ lsr r3,r2,#16
+ uxth r2,r2
+ mov r7,#38
+ mul r2,r7
+ mul r3,r7
+ ldr r4,[r1,#28]
+ lsr r5,r3,#16
+ lsl r3,r3,#16
+ mov r6,#0
+ add r4,r2
+ adc r5,r6
+ add r4,r3
+ adc r5,r6
+ // Keep the low 31 bits as a provisional out[28] ...
+ lsl r2,r4,#1
+ lsr r2,r2,#1
+ str r2,[r0,#28]
+ // ... and fold everything from bit 31 upward back in as 19 * (those bits),
+ // added to the word at offset 0.
+ lsr r4,r4,#31
+ lsl r5,r5,#1
+ orr r4,r5
+ mov r2,#19
+ mul r2,r4
+ // Offset 0: out[0] = in[0] + 38 * in[32] + fold; sum in r2, carry in r3.
+ ldr r4,[r1,#0]
+ add r2,r4
+ mov r3,#0
+ adc r3,r6
+ ldr r4,[r1,#32]
+ lsr r5,r4,#16
+ uxth r4,r4
+ mul r5,r7
+ mul r4,r7
+ add r2,r4
+ adc r3,r6
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ str r2,[r0,#0]
+ // Offset 4: out[4] = in[4] + 38 * in[36] + carry; sum in r3, carry in r2
+ // (the two registers swap roles on every word).
+ ldr r4,[r1,#4]
+ add r3,r4
+ mov r2,#0
+ adc r2,r6
+ ldr r4,[r1,#36]
+ lsr r5,r4,#16
+ uxth r4,r4
+ mul r5,r7
+ mul r4,r7
+ add r3,r4
+ adc r2,r6
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r3,r4
+ adc r2,r5
+ str r3,[r0,#4]
+ // Offset 8: in[8] + 38 * in[40] + carry; sum in r2, carry in r3.
+ ldr r4,[r1,#8]
+ add r2,r4
+ mov r3,#0
+ adc r3,r6
+ ldr r4,[r1,#40]
+ lsr r5,r4,#16
+ uxth r4,r4
+ mul r5,r7
+ mul r4,r7
+ add r2,r4
+ adc r3,r6
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ str r2,[r0,#8]
+ // Offset 12: in[12] + 38 * in[44] + carry; sum in r3, carry in r2.
+ ldr r4,[r1,#12]
+ add r3,r4
+ mov r2,#0
+ adc r2,r6
+ ldr r4,[r1,#44]
+ lsr r5,r4,#16
+ uxth r4,r4
+ mul r5,r7
+ mul r4,r7
+ add r3,r4
+ adc r2,r6
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r3,r4
+ adc r2,r5
+ str r3,[r0,#12]
+ // Offset 16: in[16] + 38 * in[48] + carry; sum in r2, carry in r3.
+ ldr r4,[r1,#16]
+ add r2,r4
+ mov r3,#0
+ adc r3,r6
+ ldr r4,[r1,#48]
+ lsr r5,r4,#16
+ uxth r4,r4
+ mul r5,r7
+ mul r4,r7
+ add r2,r4
+ adc r3,r6
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ str r2,[r0,#16]
+ // Offset 20: in[20] + 38 * in[52] + carry; sum in r3, carry in r2.
+ ldr r4,[r1,#20]
+ add r3,r4
+ mov r2,#0
+ adc r2,r6
+ ldr r4,[r1,#52]
+ lsr r5,r4,#16
+ uxth r4,r4
+ mul r5,r7
+ mul r4,r7
+ add r3,r4
+ adc r2,r6
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r3,r4
+ adc r2,r5
+ str r3,[r0,#20]
+ // Offset 24: in[24] + 38 * in[56] + carry; sum in r2, carry in r3.
+ ldr r4,[r1,#24]
+ add r2,r4
+ mov r3,#0
+ adc r3,r6
+ ldr r4,[r1,#56]
+ lsr r5,r4,#16
+ uxth r4,r4
+ mul r5,r7
+ mul r4,r7
+ add r2,r4
+ adc r3,r6
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ str r2,[r0,#24]
+ // Add the final carry to the provisional out[28] stored at the start.
+ ldr r4,[r0,#28]
+ add r4,r3
+ str r4,[r0,#28]
+ pop {r4,r5,r6,r7,r15}
+
+ .size fe25519_reduceTo256Bits_asm, .-fe25519_reduceTo256Bits_asm
+
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/mul.s b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/mul.s
new file mode 100644
index 00000000..155674c6
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/mul.s
@@ -0,0 +1,1109 @@
+ .align 2
+ .global multiply256x256_asm
+ .type multiply256x256_asm, %function
+multiply256x256_asm: // r0 = 512-bit result, r1 = x, r2 = y (argument order of the C prototype)
+ push {r4-r7,lr}
+ mov r3, r8 // copy r8-r11 into r3-r6 so that the push below saves them
+ mov r4, r9
+ mov r5, r10
+ mov r6, r11
+ push {r0-r6} // the three arguments plus the old r8-r11
+ mov r12, r0 // r12 = result pointer
+ mov r10, r2 // r10 = y
+ mov r11, r1 // r11 = x
+ mov r0,r2
+ ldm r0!, {r4,r5,r6,r7} // r4,r5 = y[0],y[1]; r0 ends up 16 bytes past y
+ ldm r1!, {r2,r3,r6,r7} // r2,r3 = x[0],x[1]; r1 ends up 16 bytes past x
+ push {r0,r1} // pointers to the upper operand halves, popped before the high part
+ /////////BEGIN LOW PART //////////////////////
+ /////////MUL128/////////////
+ //MUL64
+ mov r6, r5 // 64x64 core: a = r5:r4, b = r3:r2; keep a copy of a1
+ mov r1, r2
+ sub r5, r4
+ sbc r0, r0 // r0 = 0 or -1: sign mask of a1 - a0
+ eor r5, r0
+ sub r5, r0 // r5 = |a1 - a0|
+ sub r1, r3
+ sbc r7, r7 // r7 = 0 or -1: sign mask of b0 - b1
+ eor r1, r7
+ sub r1, r7 // r1 = |b0 - b1|
+ eor r7, r0 // r7 = sign mask of (a1 - a0) * (b0 - b1)
+ mov r9, r1
+ mov r8, r5
+ lsr r1,r4,#16 // a0 * b0 from four 16x16 partial products
+ uxth r4,r4
+ mov r0,r4
+ uxth r5,r2
+ lsr r2,#16
+ mul r0,r5//00
+ mul r5,r1//10
+ mul r4,r2//01
+ mul r1,r2//11
+ lsl r2,r4,#16
+ lsr r4,r4,#16
+ add r0,r2
+ adc r1,r4
+ lsl r2,r5,#16
+ lsr r4,r5,#16
+ add r0,r2
+ adc r1,r4 // r1:r0 = a0 * b0
+ lsr r4, r6,#16 // a1 * b1, same scheme
+ uxth r6, r6
+ uxth r5, r3
+ lsr r3, r3, #16
+ mov r2, r6
+ mul r2, r5
+ mul r5, r4
+ mul r6, r3
+ mul r3, r4
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ lsl r4,r6,#16
+ lsr r5,r6,#16
+ add r2,r4
+ adc r3,r5 // r3:r2 = a1 * b1
+ eor r6, r6
+ add r2, r1
+ adc r3, r6 // r3:r2 += high word of a0 * b0
+ mov r1, r9
+ mov r5, r8
+ mov r8, r0 // park the low word of a0 * b0
+ lsr r0, r1,#16 // |b0 - b1| * |a1 - a0|
+ uxth r1,r1
+ mov r4,r1
+ lsr r6,r5,#16
+ uxth r5,r5
+ mul r1,r5
+ mul r4,r6
+ mul r5,r0
+ mul r0,r6
+ lsl r6,r4,#16
+ lsr r4,#16
+ add r1,r6
+ adc r0,r4
+ lsl r6,r5,#16
+ lsr r5,#16
+ add r1,r6
+ adc r0,r5 // r0:r1 = product of the absolute differences (r1 = low word)
+ eor r1,r7 // complement it when the signed product is negative ...
+ eor r0,r7
+ eor r4, r4
+ asr r7, r7, #1 // ... and move the sign into carry to supply the "+1"
+ adc r1, r2
+ adc r2, r0
+ adc r7, r4
+ mov r0, r8
+ add r1, r0
+ adc r2, r3
+ adc r3, r7 // r3:r2:r1:r0 = a * b (128 bits)
+ //////////////////////////
+ mov r4, r12
+ stm r4!, {r0,r1} // result[0..1] = lowest two words of x[0..1] * y[0..1]
+ push {r4} // result pointer advanced by 8 bytes
+ push {r0,r1}
+ mov r1, r10
+ mov r10, r2 // r10/r11 take over the upper two words of that product
+ ldm r1, {r0, r1, r4, r5} // y[0..3]
+ mov r2, r4
+ mov r7, r5
+ sub r2, r0
+ sbc r7, r1
+ sbc r6, r6 // r6 = sign mask of y[2..3] - y[0..1]
+ eor r2, r6
+ eor r7, r6
+ sub r2, r6
+ sbc r7, r6
+ push {r2, r7} // |y[2..3] - y[0..1]|
+ mov r2, r11
+ mov r11, r3
+ ldm r2, {r0, r1, r2, r3} // x[0..3]
+ sub r0, r2
+ sbc r1, r3
+ sbc r7, r7 // r7 = sign mask of x[0..1] - x[2..3]
+ eor r0, r7
+ eor r1, r7
+ sub r0, r7
+ sbc r1, r7
+ eor r7, r6
+ mov r12, r7 // r12 = sign mask of the product of the two differences
+ push {r0, r1} // |x[0..1] - x[2..3]|
+ //MUL64
+ mov r6, r5 // 64x64 core: a = r5:r4, b = r3:r2; keep a copy of a1
+ mov r1, r2
+ sub r5, r4
+ sbc r0, r0 // r0 = 0 or -1: sign mask of a1 - a0
+ eor r5, r0
+ sub r5, r0 // r5 = |a1 - a0|
+ sub r1, r3
+ sbc r7, r7 // r7 = 0 or -1: sign mask of b0 - b1
+ eor r1, r7
+ sub r1, r7 // r1 = |b0 - b1|
+ eor r7, r0 // r7 = sign mask of (a1 - a0) * (b0 - b1)
+ mov r9, r1
+ mov r8, r5
+ lsr r1,r4,#16 // a0 * b0 from four 16x16 partial products
+ uxth r4,r4
+ mov r0,r4
+ uxth r5,r2
+ lsr r2,#16
+ mul r0,r5//00
+ mul r5,r1//10
+ mul r4,r2//01
+ mul r1,r2//11
+ lsl r2,r4,#16
+ lsr r4,r4,#16
+ add r0,r2
+ adc r1,r4
+ lsl r2,r5,#16
+ lsr r4,r5,#16
+ add r0,r2
+ adc r1,r4 // r1:r0 = a0 * b0
+ lsr r4, r6,#16 // a1 * b1, same scheme
+ uxth r6, r6
+ uxth r5, r3
+ lsr r3, r3, #16
+ mov r2, r6
+ mul r2, r5
+ mul r5, r4
+ mul r6, r3
+ mul r3, r4
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ lsl r4,r6,#16
+ lsr r5,r6,#16
+ add r2,r4
+ adc r3,r5 // r3:r2 = a1 * b1
+ eor r6, r6
+ add r2, r1
+ adc r3, r6 // r3:r2 += high word of a0 * b0
+ mov r1, r9
+ mov r5, r8
+ mov r8, r0 // park the low word of a0 * b0
+ lsr r0, r1,#16 // |b0 - b1| * |a1 - a0|
+ uxth r1,r1
+ mov r4,r1
+ lsr r6,r5,#16
+ uxth r5,r5
+ mul r1,r5
+ mul r4,r6
+ mul r5,r0
+ mul r0,r6
+ lsl r6,r4,#16
+ lsr r4,#16
+ add r1,r6
+ adc r0,r4
+ lsl r6,r5,#16
+ lsr r5,#16
+ add r1,r6
+ adc r0,r5 // r0:r1 = product of the absolute differences (r1 = low word)
+ eor r1,r7 // complement it when the signed product is negative ...
+ eor r0,r7
+ eor r4, r4
+ asr r7, r7, #1 // ... and move the sign into carry to supply the "+1"
+ adc r1, r2
+ adc r2, r0
+ adc r7, r4
+ mov r0, r8
+ add r1, r0
+ adc r2, r3
+ adc r3, r7 // r3:r2:r1:r0 = a * b (128 bits)
+ mov r4, r10
+ mov r5, r11
+ eor r6, r6
+ add r0, r4 // add the parked upper two words of the first 64x64 product
+ adc r1, r5
+ adc r2, r6
+ adc r3, r6
+ mov r10, r2
+ mov r11, r3
+ pop {r2-r5} // r3:r2 and r5:r4 = the two absolute differences pushed above
+ push {r0, r1}
+ mov r6, r5 // 64x64 core: a = r5:r4, b = r3:r2; keep a copy of a1
+ mov r1, r2
+ sub r5, r4
+ sbc r0, r0 // r0 = 0 or -1: sign mask of a1 - a0
+ eor r5, r0
+ sub r5, r0 // r5 = |a1 - a0|
+ sub r1, r3
+ sbc r7, r7 // r7 = 0 or -1: sign mask of b0 - b1
+ eor r1, r7
+ sub r1, r7 // r1 = |b0 - b1|
+ eor r7, r0 // r7 = sign mask of (a1 - a0) * (b0 - b1)
+ mov r9, r1
+ mov r8, r5
+ lsr r1,r4,#16 // a0 * b0 from four 16x16 partial products
+ uxth r4,r4
+ mov r0,r4
+ uxth r5,r2
+ lsr r2,#16
+ mul r0,r5//00
+ mul r5,r1//10
+ mul r4,r2//01
+ mul r1,r2//11
+ lsl r2,r4,#16
+ lsr r4,r4,#16
+ add r0,r2
+ adc r1,r4
+ lsl r2,r5,#16
+ lsr r4,r5,#16
+ add r0,r2
+ adc r1,r4 // r1:r0 = a0 * b0
+ lsr r4, r6,#16 // a1 * b1, same scheme
+ uxth r6, r6
+ uxth r5, r3
+ lsr r3, r3, #16
+ mov r2, r6
+ mul r2, r5
+ mul r5, r4
+ mul r6, r3
+ mul r3, r4
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ lsl r4,r6,#16
+ lsr r5,r6,#16
+ add r2,r4
+ adc r3,r5 // r3:r2 = a1 * b1
+ eor r6, r6
+ add r2, r1
+ adc r3, r6 // r3:r2 += high word of a0 * b0
+ mov r1, r9
+ mov r5, r8
+ mov r8, r0 // park the low word of a0 * b0
+ lsr r0, r1,#16 // |b0 - b1| * |a1 - a0|
+ uxth r1,r1
+ mov r4,r1
+ lsr r6,r5,#16
+ uxth r5,r5
+ mul r1,r5
+ mul r4,r6
+ mul r5,r0
+ mul r0,r6
+ lsl r6,r4,#16
+ lsr r4,#16
+ add r1,r6
+ adc r0,r4
+ lsl r6,r5,#16
+ lsr r5,#16
+ add r1,r6
+ adc r0,r5 // r0:r1 = product of the absolute differences (r1 = low word)
+ eor r1,r7 // complement it when the signed product is negative ...
+ eor r0,r7
+ eor r4, r4
+ asr r7, r7, #1 // ... and move the sign into carry to supply the "+1"
+ adc r1, r2
+ adc r2, r0
+ adc r7, r4
+ mov r0, r8
+ add r1, r0
+ adc r2, r3
+ adc r3, r7 // r3:r2:r1:r0 = a * b (128 bits)
+ pop {r4, r5}
+ mov r6, r12
+ mov r7, r12
+ eor r0, r6 // apply the sign mask from r12 to the product of the differences
+ eor r1, r6
+ eor r2, r6
+ eor r3, r6
+ asr r6, r6, #1 // carry = sign bit
+ adc r0, r4
+ adc r1, r5
+ adc r4, r2
+ adc r5, r3
+ eor r2, r2
+ adc r6,r2
+ adc r7,r2
+ pop {r2, r3} // the two words already stored to result[0..1]
+ mov r8, r2
+ mov r9, r3
+ add r2, r0
+ adc r3, r1
+ mov r0, r10
+ mov r1, r11
+ adc r4, r0
+ adc r5, r1
+ adc r6, r0
+ adc r7, r1
+ ////////END LOW PART/////////////////////
+ pop {r0} // result pointer as left by the first store (8 bytes in)
+ stm r0!, {r2,r3} // result[2..3]
+ pop {r1,r2} // pointers to the upper halves of y (r1) and x (r2), pushed in the prologue
+ push {r0}
+ push {r4-r7} // upper four words of the low-part product, added in after the high part
+ mov r10, r1
+ mov r11, r2
+ ldm r1!, {r4, r5}
+ ldm r2, {r2, r3}
+ /////////BEGIN HIGH PART////////////////
+ /////////MUL128/////////////
+ //MUL64
+ mov r6, r5 // 64x64 core: a = r5:r4, b = r3:r2; keep a copy of a1
+ mov r1, r2
+ sub r5, r4
+ sbc r0, r0 // r0 = 0 or -1: sign mask of a1 - a0
+ eor r5, r0
+ sub r5, r0 // r5 = |a1 - a0|
+ sub r1, r3
+ sbc r7, r7 // r7 = 0 or -1: sign mask of b0 - b1
+ eor r1, r7
+ sub r1, r7 // r1 = |b0 - b1|
+ eor r7, r0 // r7 = sign mask of (a1 - a0) * (b0 - b1)
+ mov r9, r1
+ mov r8, r5
+ lsr r1,r4,#16 // a0 * b0 from four 16x16 partial products
+ uxth r4,r4
+ mov r0,r4
+ uxth r5,r2
+ lsr r2,#16
+ mul r0,r5//00
+ mul r5,r1//10
+ mul r4,r2//01
+ mul r1,r2//11
+ lsl r2,r4,#16
+ lsr r4,r4,#16
+ add r0,r2
+ adc r1,r4
+ lsl r2,r5,#16
+ lsr r4,r5,#16
+ add r0,r2
+ adc r1,r4 // r1:r0 = a0 * b0
+ lsr r4, r6,#16 // a1 * b1, same scheme
+ uxth r6, r6
+ uxth r5, r3
+ lsr r3, r3, #16
+ mov r2, r6
+ mul r2, r5
+ mul r5, r4
+ mul r6, r3
+ mul r3, r4
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ lsl r4,r6,#16
+ lsr r5,r6,#16
+ add r2,r4
+ adc r3,r5 // r3:r2 = a1 * b1
+ eor r6, r6
+ add r2, r1
+ adc r3, r6 // r3:r2 += high word of a0 * b0
+ mov r1, r9
+ mov r5, r8
+ mov r8, r0 // park the low word of a0 * b0
+ lsr r0, r1,#16 // |b0 - b1| * |a1 - a0|
+ uxth r1,r1
+ mov r4,r1
+ lsr r6,r5,#16
+ uxth r5,r5
+ mul r1,r5
+ mul r4,r6
+ mul r5,r0
+ mul r0,r6
+ lsl r6,r4,#16
+ lsr r4,#16
+ add r1,r6
+ adc r0,r4
+ lsl r6,r5,#16
+ lsr r5,#16
+ add r1,r6
+ adc r0,r5 // r0:r1 = product of the absolute differences (r1 = low word)
+ eor r1,r7 // complement it when the signed product is negative ...
+ eor r0,r7
+ eor r4, r4
+ asr r7, r7, #1 // ... and move the sign into carry to supply the "+1"
+ adc r1, r2
+ adc r2, r0
+ adc r7, r4
+ mov r0, r8
+ add r1, r0
+ adc r2, r3
+ adc r3, r7 // r3:r2:r1:r0 = a * b (128 bits)
+ push {r0,r1} // lowest two words of this 64x64 product
+ mov r1, r10
+ mov r10, r2 // r10/r11 take over the upper two words of that product
+ ldm r1, {r0, r1, r4, r5} // four words of the first operand half
+ mov r2, r4
+ mov r7, r5
+ sub r2, r0
+ sbc r7, r1
+ sbc r6, r6 // r6 = sign mask of (words 2..3) - (words 0..1)
+ eor r2, r6
+ eor r7, r6
+ sub r2, r6
+ sbc r7, r6
+ push {r2, r7} // absolute value of that difference
+ mov r2, r11
+ mov r11, r3
+ ldm r2, {r0, r1, r2, r3} // four words of the second operand half
+ sub r0, r2
+ sbc r1, r3
+ sbc r7, r7 // r7 = sign mask of (words 0..1) - (words 2..3)
+ eor r0, r7
+ eor r1, r7
+ sub r0, r7
+ sbc r1, r7
+ eor r7, r6
+ mov r12, r7 // r12 = sign mask of the product of the two differences
+ push {r0, r1} // absolute value of the second difference
+ //MUL64
+ mov r6, r5 // 64x64 core: a = r5:r4, b = r3:r2; keep a copy of a1
+ mov r1, r2
+ sub r5, r4
+ sbc r0, r0 // r0 = 0 or -1: sign mask of a1 - a0
+ eor r5, r0
+ sub r5, r0 // r5 = |a1 - a0|
+ sub r1, r3
+ sbc r7, r7 // r7 = 0 or -1: sign mask of b0 - b1
+ eor r1, r7
+ sub r1, r7 // r1 = |b0 - b1|
+ eor r7, r0 // r7 = sign mask of (a1 - a0) * (b0 - b1)
+ mov r9, r1
+ mov r8, r5
+ lsr r1,r4,#16 // a0 * b0 from four 16x16 partial products
+ uxth r4,r4
+ mov r0,r4
+ uxth r5,r2
+ lsr r2,#16
+ mul r0,r5//00
+ mul r5,r1//10
+ mul r4,r2//01
+ mul r1,r2//11
+ lsl r2,r4,#16
+ lsr r4,r4,#16
+ add r0,r2
+ adc r1,r4
+ lsl r2,r5,#16
+ lsr r4,r5,#16
+ add r0,r2
+ adc r1,r4 // r1:r0 = a0 * b0
+ lsr r4, r6,#16 // a1 * b1, same scheme
+ uxth r6, r6
+ uxth r5, r3
+ lsr r3, r3, #16
+ mov r2, r6
+ mul r2, r5
+ mul r5, r4
+ mul r6, r3
+ mul r3, r4
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ lsl r4,r6,#16
+ lsr r5,r6,#16
+ add r2,r4
+ adc r3,r5 // r3:r2 = a1 * b1
+ eor r6, r6
+ add r2, r1
+ adc r3, r6 // r3:r2 += high word of a0 * b0
+ mov r1, r9
+ mov r5, r8
+ mov r8, r0 // park the low word of a0 * b0
+ lsr r0, r1,#16 // |b0 - b1| * |a1 - a0|
+ uxth r1,r1
+ mov r4,r1
+ lsr r6,r5,#16
+ uxth r5,r5
+ mul r1,r5
+ mul r4,r6
+ mul r5,r0
+ mul r0,r6
+ lsl r6,r4,#16
+ lsr r4,#16
+ add r1,r6
+ adc r0,r4
+ lsl r6,r5,#16
+ lsr r5,#16
+ add r1,r6
+ adc r0,r5 // r0:r1 = product of the absolute differences (r1 = low word)
+ eor r1,r7 // complement it when the signed product is negative ...
+ eor r0,r7
+ eor r4, r4
+ asr r7, r7, #1 // ... and move the sign into carry to supply the "+1"
+ adc r1, r2
+ adc r2, r0
+ adc r7, r4
+ mov r0, r8
+ add r1, r0
+ adc r2, r3
+ adc r3, r7 // r3:r2:r1:r0 = a * b (128 bits)
+ mov r4, r10
+ mov r5, r11
+ eor r6, r6
+ add r0, r4 // add the parked upper two words of the first 64x64 product
+ adc r1, r5
+ adc r2, r6
+ adc r3, r6
+ mov r10, r2
+ mov r11, r3
+ pop {r2-r5} // r3:r2 and r5:r4 = the two absolute differences pushed above
+ push {r0, r1}
+ mov r6, r5 // 64x64 core: a = r5:r4, b = r3:r2; keep a copy of a1
+ mov r1, r2
+ sub r5, r4
+ sbc r0, r0 // r0 = 0 or -1: sign mask of a1 - a0
+ eor r5, r0
+ sub r5, r0 // r5 = |a1 - a0|
+ sub r1, r3
+ sbc r7, r7 // r7 = 0 or -1: sign mask of b0 - b1
+ eor r1, r7
+ sub r1, r7 // r1 = |b0 - b1|
+ eor r7, r0 // r7 = sign mask of (a1 - a0) * (b0 - b1)
+ mov r9, r1
+ mov r8, r5
+ lsr r1,r4,#16 // a0 * b0 from four 16x16 partial products
+ uxth r4,r4
+ mov r0,r4
+ uxth r5,r2
+ lsr r2,#16
+ mul r0,r5//00
+ mul r5,r1//10
+ mul r4,r2//01
+ mul r1,r2//11
+ lsl r2,r4,#16
+ lsr r4,r4,#16
+ add r0,r2
+ adc r1,r4
+ lsl r2,r5,#16
+ lsr r4,r5,#16
+ add r0,r2
+ adc r1,r4 // r1:r0 = a0 * b0
+ lsr r4, r6,#16 // a1 * b1, same scheme
+ uxth r6, r6
+ uxth r5, r3
+ lsr r3, r3, #16
+ mov r2, r6
+ mul r2, r5
+ mul r5, r4
+ mul r6, r3
+ mul r3, r4
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ lsl r4,r6,#16
+ lsr r5,r6,#16
+ add r2,r4
+ adc r3,r5 // r3:r2 = a1 * b1
+ eor r6, r6
+ add r2, r1
+ adc r3, r6 // r3:r2 += high word of a0 * b0
+ mov r1, r9
+ mov r5, r8
+ mov r8, r0 // park the low word of a0 * b0
+ lsr r0, r1,#16 // |b0 - b1| * |a1 - a0|
+ uxth r1,r1
+ mov r4,r1
+ lsr r6,r5,#16
+ uxth r5,r5
+ mul r1,r5
+ mul r4,r6
+ mul r5,r0
+ mul r0,r6
+ lsl r6,r4,#16
+ lsr r4,#16
+ add r1,r6
+ adc r0,r4
+ lsl r6,r5,#16
+ lsr r5,#16
+ add r1,r6
+ adc r0,r5 // r0:r1 = product of the absolute differences (r1 = low word)
+ eor r1,r7 // complement it when the signed product is negative ...
+ eor r0,r7
+ eor r4, r4
+ asr r7, r7, #1 // ... and move the sign into carry to supply the "+1"
+ adc r1, r2
+ adc r2, r0
+ adc r7, r4
+ mov r0, r8
+ add r1, r0
+ adc r2, r3
+ adc r3, r7 // r3:r2:r1:r0 = a * b (128 bits)
+ pop {r4, r5}
+ mov r6, r12
+ mov r7, r12
+ eor r0, r6 // apply the sign mask from r12 to the product of the differences
+ eor r1, r6
+ eor r2, r6
+ eor r3, r6
+ asr r6, r6, #1 // carry = sign bit
+ adc r0, r4
+ adc r1, r5
+ adc r4, r2
+ adc r5, r3
+ eor r2, r2
+ adc r6,r2 //0,1
+ adc r7,r2
+ pop {r2, r3} // lowest two words of the first 64x64 product of this part
+ mov r8, r2
+ mov r9, r3
+ add r2, r0
+ adc r3, r1
+ mov r0, r10
+ mov r1, r11
+ adc r4, r0
+ adc r5, r1
+ adc r6, r0
+ adc r7, r1
+ ////////END HIGH PART/////////////////////
+ mov r0, r8 // r1:r0 = lowest two words of the high-part product
+ mov r1, r9
+ mov r8, r6
+ mov r9, r7
+ pop {r6, r7}
+ add r0, r6 // add the upper four words of the low-part product (pushed before the high part)
+ adc r1, r7
+ pop {r6, r7}
+ adc r2, r6
+ adc r3, r7
+ pop {r7}
+ stm r7!, {r0-r3} // preliminary result[4..7]
+ mov r10, r7 // r10 = address of result[8]
+ eor r0,r0
+ mov r6, r8
+ mov r7, r9
+ adc r4, r0 // ripple the carry through the upper words of the high-part product
+ adc r5, r0
+ adc r6, r0
+ adc r7, r0
+ pop {r0,r1,r2} // the original arguments: result, x, y
+ mov r12, r2
+ push {r0, r4-r7}
+ ldm r1, {r0-r7} // x[0..7]
+ sub r0, r4
+ sbc r1, r5
+ sbc r2, r6
+ sbc r3, r7
+ eor r4, r4
+ sbc r4, r4 // r4 = sign mask of x[0..3] - x[4..7]
+ eor r0, r4
+ eor r1, r4
+ eor r2, r4
+ eor r3, r4
+ sub r0, r4
+ sbc r1, r4
+ sbc r2, r4
+ sbc r3, r4
+ mov r6, r12
+ mov r12, r4 //carry
+ mov r5, r10
+ stm r5!, {r0-r3} // |x[0..3] - x[4..7]| parked in result[8..11]
+ mov r11, r5
+ mov r8, r0
+ mov r9, r1
+ ldm r6, {r0-r7} // y[0..7]
+ sub r4, r0
+ sbc r5, r1
+ sbc r6, r2
+ sbc r7, r3
+ eor r0, r0
+ sbc r0, r0 // r0 = sign mask of y[4..7] - y[0..3]
+ eor r4, r0
+ eor r5, r0
+ eor r6, r0
+ eor r7, r0
+ sub r4, r0
+ sbc r5, r0
+ sbc r6, r0
+ sbc r7, r0
+ mov r1, r12
+ eor r0, r1 // combined sign mask of the two differences
+ mov r1, r11
+ stm r1!, {r4-r7} // |y[4..7] - y[0..3]| parked in result[12..15]
+ push {r0}
+ mov r2, r8
+ mov r3, r9
+ /////////BEGIN MIDDLE PART////////////////
+ /////////MUL128/////////////
+ //MUL64
+ mov r6, r5 // 64x64 core: a = r5:r4, b = r3:r2; keep a copy of a1
+ mov r1, r2
+ sub r5, r4
+ sbc r0, r0 // r0 = 0 or -1: sign mask of a1 - a0
+ eor r5, r0
+ sub r5, r0 // r5 = |a1 - a0|
+ sub r1, r3
+ sbc r7, r7 // r7 = 0 or -1: sign mask of b0 - b1
+ eor r1, r7
+ sub r1, r7 // r1 = |b0 - b1|
+ eor r7, r0 // r7 = sign mask of (a1 - a0) * (b0 - b1)
+ mov r9, r1
+ mov r8, r5
+ lsr r1,r4,#16 // a0 * b0 from four 16x16 partial products
+ uxth r4,r4
+ mov r0,r4
+ uxth r5,r2
+ lsr r2,#16
+ mul r0,r5//00
+ mul r5,r1//10
+ mul r4,r2//01
+ mul r1,r2//11
+ lsl r2,r4,#16
+ lsr r4,r4,#16
+ add r0,r2
+ adc r1,r4
+ lsl r2,r5,#16
+ lsr r4,r5,#16
+ add r0,r2
+ adc r1,r4 // r1:r0 = a0 * b0
+ lsr r4, r6,#16 // a1 * b1, same scheme
+ uxth r6, r6
+ uxth r5, r3
+ lsr r3, r3, #16
+ mov r2, r6
+ mul r2, r5
+ mul r5, r4
+ mul r6, r3
+ mul r3, r4
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ lsl r4,r6,#16
+ lsr r5,r6,#16
+ add r2,r4
+ adc r3,r5 // r3:r2 = a1 * b1
+ eor r6, r6
+ add r2, r1
+ adc r3, r6 // r3:r2 += high word of a0 * b0
+ mov r1, r9
+ mov r5, r8
+ mov r8, r0 // park the low word of a0 * b0
+ lsr r0, r1,#16 // |b0 - b1| * |a1 - a0|
+ uxth r1,r1
+ mov r4,r1
+ lsr r6,r5,#16
+ uxth r5,r5
+ mul r1,r5
+ mul r4,r6
+ mul r5,r0
+ mul r0,r6
+ lsl r6,r4,#16
+ lsr r4,#16
+ add r1,r6
+ adc r0,r4
+ lsl r6,r5,#16
+ lsr r5,#16
+ add r1,r6
+ adc r0,r5 // r0:r1 = product of the absolute differences (r1 = low word)
+ eor r1,r7 // complement it when the signed product is negative ...
+ eor r0,r7
+ eor r4, r4
+ asr r7, r7, #1 // ... and move the sign into carry to supply the "+1"
+ adc r1, r2
+ adc r2, r0
+ adc r7, r4
+ mov r0, r8
+ add r1, r0
+ adc r2, r3
+ adc r3, r7 // r3:r2:r1:r0 = a * b (128 bits)
+ push {r0,r1} // lowest two words of this 64x64 product
+ mov r1, r10
+ mov r10, r2 // r10/r11 take over the upper two words of that product
+ ldm r1, {r0, r1, r4, r5} // the four words parked in result[8..11]
+ mov r2, r4
+ mov r7, r5
+ sub r2, r0
+ sbc r7, r1
+ sbc r6, r6 // r6 = sign mask of (words 2..3) - (words 0..1)
+ eor r2, r6
+ eor r7, r6
+ sub r2, r6
+ sbc r7, r6
+ push {r2, r7} // absolute value of that difference
+ mov r2, r11
+ mov r11, r3
+ ldm r2, {r0, r1, r2, r3} // the four words parked in result[12..15]
+ sub r0, r2
+ sbc r1, r3
+ sbc r7, r7 // r7 = sign mask of (words 0..1) - (words 2..3)
+ eor r0, r7
+ eor r1, r7
+ sub r0, r7
+ sbc r1, r7
+ eor r7, r6
+ mov r12, r7 // r12 = sign mask of the product of the two differences
+ push {r0, r1} // absolute value of the second difference
+ //MUL64
+ mov r6, r5 // 64x64 core: a = r5:r4, b = r3:r2; keep a copy of a1
+ mov r1, r2
+ sub r5, r4
+ sbc r0, r0 // r0 = 0 or -1: sign mask of a1 - a0
+ eor r5, r0
+ sub r5, r0 // r5 = |a1 - a0|
+ sub r1, r3
+ sbc r7, r7 // r7 = 0 or -1: sign mask of b0 - b1
+ eor r1, r7
+ sub r1, r7 // r1 = |b0 - b1|
+ eor r7, r0 // r7 = sign mask of (a1 - a0) * (b0 - b1)
+ mov r9, r1
+ mov r8, r5
+ lsr r1,r4,#16 // a0 * b0 from four 16x16 partial products
+ uxth r4,r4
+ mov r0,r4
+ uxth r5,r2
+ lsr r2,#16
+ mul r0,r5//00
+ mul r5,r1//10
+ mul r4,r2//01
+ mul r1,r2//11
+ lsl r2,r4,#16
+ lsr r4,r4,#16
+ add r0,r2
+ adc r1,r4
+ lsl r2,r5,#16
+ lsr r4,r5,#16
+ add r0,r2
+ adc r1,r4 // r1:r0 = a0 * b0
+ lsr r4, r6,#16 // a1 * b1, same scheme
+ uxth r6, r6
+ uxth r5, r3
+ lsr r3, r3, #16
+ mov r2, r6
+ mul r2, r5
+ mul r5, r4
+ mul r6, r3
+ mul r3, r4
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ lsl r4,r6,#16
+ lsr r5,r6,#16
+ add r2,r4
+ adc r3,r5 // r3:r2 = a1 * b1
+ eor r6, r6
+ add r2, r1
+ adc r3, r6 // r3:r2 += high word of a0 * b0
+ mov r1, r9
+ mov r5, r8
+ mov r8, r0 // park the low word of a0 * b0
+ lsr r0, r1,#16 // |b0 - b1| * |a1 - a0|
+ uxth r1,r1
+ mov r4,r1
+ lsr r6,r5,#16
+ uxth r5,r5
+ mul r1,r5
+ mul r4,r6
+ mul r5,r0
+ mul r0,r6
+ lsl r6,r4,#16
+ lsr r4,#16
+ add r1,r6
+ adc r0,r4
+ lsl r6,r5,#16
+ lsr r5,#16
+ add r1,r6
+ adc r0,r5 // r0:r1 = product of the absolute differences (r1 = low word)
+ eor r1,r7 // complement it when the signed product is negative ...
+ eor r0,r7
+ eor r4, r4
+ asr r7, r7, #1 // ... and move the sign into carry to supply the "+1"
+ adc r1, r2
+ adc r2, r0
+ adc r7, r4
+ mov r0, r8
+ add r1, r0
+ adc r2, r3
+ adc r3, r7 // r3:r2:r1:r0 = a * b (128 bits)
+ mov r4, r10
+ mov r5, r11
+ eor r6, r6
+ add r0, r4 // add the parked upper two words of the first 64x64 product
+ adc r1, r5
+ adc r2, r6
+ adc r3, r6
+ mov r10, r2
+ mov r11, r3
+ pop {r2-r5} // r3:r2 and r5:r4 = the two absolute differences pushed above
+ push {r0, r1}
+ mov r6, r5 // 64x64 core: a = r5:r4, b = r3:r2; keep a copy of a1
+ mov r1, r2
+ sub r5, r4
+ sbc r0, r0 // r0 = 0 or -1: sign mask of a1 - a0
+ eor r5, r0
+ sub r5, r0 // r5 = |a1 - a0|
+ sub r1, r3
+ sbc r7, r7 // r7 = 0 or -1: sign mask of b0 - b1
+ eor r1, r7
+ sub r1, r7 // r1 = |b0 - b1|
+ eor r7, r0 // r7 = sign mask of (a1 - a0) * (b0 - b1)
+ mov r9, r1
+ mov r8, r5
+ lsr r1,r4,#16 // a0 * b0 from four 16x16 partial products
+ uxth r4,r4
+ mov r0,r4
+ uxth r5,r2
+ lsr r2,#16
+ mul r0,r5//00
+ mul r5,r1//10
+ mul r4,r2//01
+ mul r1,r2//11
+ lsl r2,r4,#16
+ lsr r4,r4,#16
+ add r0,r2
+ adc r1,r4
+ lsl r2,r5,#16
+ lsr r4,r5,#16
+ add r0,r2
+ adc r1,r4 // r1:r0 = a0 * b0
+ lsr r4, r6,#16 // a1 * b1, same scheme
+ uxth r6, r6
+ uxth r5, r3
+ lsr r3, r3, #16
+ mov r2, r6
+ mul r2, r5
+ mul r5, r4
+ mul r6, r3
+ mul r3, r4
+ lsl r4,r5,#16
+ lsr r5,r5,#16
+ add r2,r4
+ adc r3,r5
+ lsl r4,r6,#16
+ lsr r5,r6,#16
+ add r2,r4
+ adc r3,r5 // r3:r2 = a1 * b1
+ eor r6, r6
+ add r2, r1
+ adc r3, r6 // r3:r2 += high word of a0 * b0
+ mov r1, r9
+ mov r5, r8
+ mov r8, r0 // park the low word of a0 * b0
+ lsr r0, r1,#16 // |b0 - b1| * |a1 - a0|
+ uxth r1,r1
+ mov r4,r1
+ lsr r6,r5,#16
+ uxth r5,r5
+ mul r1,r5
+ mul r4,r6
+ mul r5,r0
+ mul r0,r6
+ lsl r6,r4,#16
+ lsr r4,#16
+ add r1,r6
+ adc r0,r4
+ lsl r6,r5,#16
+ lsr r5,#16
+ add r1,r6
+ adc r0,r5 // r0:r1 = product of the absolute differences (r1 = low word)
+ eor r1,r7 // complement it when the signed product is negative ...
+ eor r0,r7
+ eor r4, r4
+ asr r7, r7, #1 // ... and move the sign into carry to supply the "+1"
+ adc r1, r2
+ adc r2, r0
+ adc r7, r4
+ mov r0, r8
+ add r1, r0
+ adc r2, r3
+ adc r3, r7 // r3:r2:r1:r0 = a * b (128 bits)
+ pop {r4, r5}
+ mov r6, r12
+ mov r7, r12
+ eor r0, r6 // apply the sign mask from r12 to the product of the differences
+ eor r1, r6
+ eor r2, r6
+ eor r3, r6
+ asr r6, r6, #1 // carry = sign bit
+ adc r0, r4
+ adc r1, r5
+ adc r4, r2
+ adc r5, r3
+ eor r2, r2
+ adc r6,r2 //0,1
+ adc r7,r2
+ pop {r2, r3} // lowest two words of the first 64x64 product of this part
+ mov r8, r2
+ mov r9, r3
+ add r2, r0
+ adc r3, r1
+ mov r0, r10
+ mov r1, r11
+ adc r4, r0
+ adc r5, r1
+ adc r6, r0
+ adc r7, r1
+ //////////END MIDDLE PART////////////////
+ pop {r0,r1} //r0,r1
+ mov r12, r0 //negative
+ eor r2, r0 // complement words 2..7 of the middle product when the sign mask in r0 is set
+ eor r3, r0
+ eor r4, r0
+ eor r5, r0
+ eor r6, r0
+ eor r7, r0
+ push {r4-r7}
+ ldm r1!, {r4-r7} // result[0..3]
+ mov r11, r1 //reference
+ mov r1, r9
+ eor r1, r0 // same for word 1 (held in r9) ...
+ mov r10, r4
+ mov r4, r8
+ asr r0, #1 // carry = sign bit, r0 keeps the mask
+ eor r0, r4 // ... and word 0 (held in r8)
+ mov r4, r10
+ adc r0, r4 // signed middle product + result[0..3]
+ adc r1, r5
+ adc r2, r6
+ adc r3, r7
+ eor r4, r4
+ adc r4, r4
+ mov r10, r4 //carry
+ mov r4, r11
+ ldm r4, {r4-r7} // preliminary result[4..7]
+ add r0, r4
+ adc r1, r5
+ adc r2, r6
+ adc r3, r7
+ mov r9, r4
+ mov r4, r11
+ stm r4!, {r0-r3} // final result[4..7]
+ mov r11, r4
+ pop {r0-r3} // words 4..7 of the (sign-adjusted) middle product
+ mov r4, r9
+ adc r4, r0
+ adc r5, r1
+ adc r6, r2
+ adc r7, r3
+ mov r1, #0
+ adc r1, r1
+ mov r0, r10
+ mov r10, r1 //carry
+ asr r0, #1 // bring the carry saved in r10 back into the carry flag
+ pop {r0-r3} // upper four words of the high-part product
+ adc r4, r0
+ adc r5, r1
+ adc r6, r2
+ adc r7, r3
+ mov r8, r0
+ mov r0, r11
+ stm r0!, {r4-r7} // final result[8..11]
+ mov r11, r0
+ mov r0, r8
+ mov r6, r12
+ mov r5, r10
+ eor r4, r4
+ adc r5, r6 // pending carries plus the sign mask of the middle product
+ adc r6, r4
+ add r0, r5
+ adc r1, r6
+ adc r2, r6
+ adc r3, r6
+ mov r7, r11
+ stm r7!, {r0-r3} // final result[12..15]
+ pop {r3-r6} // the r8-r11 values saved in the prologue
+ mov r8, r3
+ mov r9, r4
+ mov r10, r5
+ mov r11, r6
+ pop {r4-r7,pc} // restores r4-r7 and returns by loading pc
+ bx lr // NOTE(review): never reached, the pop above has already loaded pc
+.size multiply256x256_asm, .-multiply256x256_asm
+
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/scalarmult.c b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/scalarmult.c
new file mode 100644
index 00000000..488aac78
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/scalarmult.c
@@ -0,0 +1,761 @@
+/* =======================
+ ============================ C/C++ HEADER FILE =============================
+ =======================
+
+ Collection of all required submodules from naclM0 required for curve25519
+ scalar multiplication (not including randomization, etc.) alone.
+
+ Library naclM0 is largely based on the avrNaCl work of M. Hutter and P. Schwabe.
+
+ Will compile to the two functions
+
+ int
+ crypto_scalarmult_curve25519_base(
+ unsigned char* q,
+ const unsigned char* n
+ );
+
+ int
+ crypto_scalarmult_curve25519 (
+ unsigned char* r,
+ const unsigned char* s,
+ const unsigned char* p
+ );
+
+ Requires inttypes.h header and the four external assembly functions
+
+ extern void
+ fe25519_reduceTo256Bits_asm (
+ fe25519 *res,
+ const UN_512bitValue *in
+ );
+
+ extern void
+ fe25519_mpyWith121666_asm (
+ fe25519* out,
+ const fe25519* in
+ );
+
+ extern void
+ multiply256x256_asm (
+ UN_512bitValue* result,
+ const UN_256bitValue* x,
+ const UN_256bitValue* y
+ );
+
+ extern void
+ square256_asm (
+ UN_512bitValue* result,
+ const UN_256bitValue* x
+ );
+
+ \file scalarmult.c
+
+ \Author B. Haase, Endress + Hauser Conducta GmbH & Co. KG
+
+ License: CC Common Creative license Attribution 4.0 International (CC BY 4.0)
+ http://creativecommons.org/licenses/by/4.0/
+ ============================================================================*/
+
+#include <inttypes.h>
+
+// comment out this line if implementing conditional swaps by data moves
+//#define DH_SWAP_BY_POINTERS
+
+// Define the symbol to 0 in order to only use ladder steps
+//#define DH_REPLACE_LAST_THREE_LADDERSTEPS_WITH_DOUBLINGS 1
+
+typedef uint8_t uint8; // short aliases for the fixed-width unsigned types of <inttypes.h>
+typedef uint16_t uint16;
+typedef uint32_t uint32;
+typedef uint64_t uint64;
+typedef uintptr_t uintptr;
+
+typedef int8_t int8; // and for their signed counterparts
+typedef int16_t int16;
+typedef int32_t int32;
+typedef int64_t int64;
+typedef intptr_t intptr;
+
+// Note that it's important to define the uint8 as first union member, so that
+// an array of uint8 may be used as initializer.
+typedef union UN_256bitValue_
+{
+ uint8 as_uint8[32]; // the same 32 bytes of storage, viewed at four different widths
+ uint16 as_uint16[16];
+ uint32 as_uint32[8];
+ uint64 as_uint64[4];
+} UN_256bitValue;
+
+// Note that it's important to define the uint8 as first union member, so that
+// an array of uint8 may be used as initializer.
+typedef union UN_512bitValue_
+{
+ uint8 as_uint8[64]; // the same 64 bytes of storage, viewed at four different widths ...
+ uint16 as_uint16[32];
+ uint32 as_uint32[16];
+ uint64 as_uint64[8];
+ UN_256bitValue as_256_bitValue[2]; // ... or as two consecutive 256-bit values
+} UN_512bitValue;
+
+typedef UN_256bitValue fe25519; // operand type of all fe25519_* routines in this file
+
+// ****************************************************
+// Assembly functions.
+// ****************************************************
+
+extern void
+fe25519_reduceTo256Bits_asm( // assembly routine; called after every multiply/square to bring the 512-bit result back to an fe25519
+ fe25519 *res,
+ const UN_512bitValue *in
+);
+
+#define fe25519_mpyWith121666 fe25519_mpyWith121666_asm
+extern void
+fe25519_mpyWith121666_asm ( // assembly routine; used for the "a24 * E" step of the ladder formulas
+ fe25519* out,
+ const fe25519* in
+);
+
+#define multiply256x256 multiply256x256_asm
+extern void
+multiply256x256( // because of the #define above this declares multiply256x256_asm
+ UN_512bitValue* result,
+ const UN_256bitValue* x,
+ const UN_256bitValue* y
+);
+
+#define square256 square256_asm
+extern void
+square256( // because of the #define above this declares square256_asm
+ UN_512bitValue* result,
+ const UN_256bitValue* x
+);
+
+// ****************************************************
+// C functions for fe25519
+// ****************************************************
+
+// Copies *source to *dest, one 32-bit word at a time.
+static void
+fe25519_cpy(
+    fe25519* dest,
+    const fe25519* source
+)
+{
+    uint32 word = 0;
+    do
+    {
+        dest->as_uint32[word] = source->as_uint32[word];
+    } while (++word < 8);
+}
+
+// Copies the 32 input bytes into *out and then clears bit 7 of byte 31.
+static void
+fe25519_unpack(
+    volatile fe25519* out,
+    const unsigned char in[32]
+)
+{
+    uint8 idx;
+    for (idx = 0; idx != 32; ++idx)
+    {
+        out->as_uint8[idx] = in[idx];
+    }
+    out->as_uint8[31] &= 0x7f; // the topmost bit of the last byte is never kept
+}
+
+// out = baseValue - valueToSubstract, reduced "on the fly" modulo 2^255 - 19.
+static void
+fe25519_sub(
+    fe25519* out,
+    const fe25519* baseValue,
+    const fe25519* valueToSubstract
+)
+{
+    int64 diff;
+    uint16 word = 0;
+
+    // The most significant word comes first, so that whatever it overflows
+    // by can be fed back into the low words while they are processed.
+    diff = baseValue->as_uint32[7];
+    diff -= valueToSubstract->as_uint32[7];
+
+    // Bit #31 is forced to 1 here; the "- 1" in the next statement makes up
+    // for it and ensures that the final result is positive.
+    out->as_uint32[7] = ((uint32)diff) | 0x80000000ul;
+
+    // Everything from bit 31 upwards re-enters at the bottom, times 19.
+    diff = 19 * ((int32)(diff >> 31) - 1);
+
+    while (word < 7)
+    {
+        diff += baseValue->as_uint32[word];
+        diff -= valueToSubstract->as_uint32[word];
+        out->as_uint32[word] = (uint32)diff;
+        diff >>= 32;
+        word += 1;
+    }
+    diff += out->as_uint32[7];
+    out->as_uint32[7] = (uint32)diff;
+}
+
+// out = baseValue + valueToAdd, reduced "on the fly" modulo 2^255 - 19.
+static void
+fe25519_add(
+    fe25519* out,
+    const fe25519* baseValue,
+    const fe25519* valueToAdd
+)
+{
+    uint64 sum;
+    uint16 word;
+
+    // The most significant word comes first: everything from bit 31 of it
+    // upwards is stripped off and re-enters at the bottom, times 19.
+    sum = baseValue->as_uint32[7];
+    sum += valueToAdd->as_uint32[7];
+    out->as_uint32[7] = ((uint32)sum) & 0x7ffffffful;
+
+    sum = ((uint32)(sum >> 31)) * 19;
+
+    for (word = 0; word != 7; ++word)
+    {
+        sum += baseValue->as_uint32[word];
+        sum += valueToAdd->as_uint32[word];
+        out->as_uint32[word] = (uint32)sum;
+        sum >>= 32;
+    }
+    sum += out->as_uint32[7];
+    out->as_uint32[7] = (uint32)sum;
+}
+
+// result = in1 * in2: 512-bit product first, then the assembly reduction.
+static void
+fe25519_mul(
+    fe25519* result,
+    const fe25519* in1,
+    const fe25519* in2
+)
+{
+    UN_512bitValue wideProduct;
+    multiply256x256(&wideProduct, in1, in2);
+    fe25519_reduceTo256Bits_asm(result, &wideProduct);
+}
+
+// result = in * in: 512-bit square first, then the assembly reduction.
+static void
+fe25519_square(
+    fe25519* result,
+    const fe25519* in
+)
+{
+    UN_512bitValue wideSquare;
+    square256(&wideSquare, in);
+    fe25519_reduceTo256Bits_asm(result, &wideSquare);
+}
+
+// Final reduction of *inout modulo 2^255 - 19, done in place. The stored
+// result always has bit #31 of its most significant word cleared.
+static void
+fe25519_reduceCompletely(
+    volatile fe25519* inout
+)
+{
+    uint32 guess = inout->as_uint32[7] >> 31;
+    uint32 timesToSubtract;
+    uint64 carry;
+    uint8 word;
+
+    // Dry run, nothing is written back: add 19 once more often than the
+    // guess suggests and look at where the carry chain ends up.
+    //
+    // The guess is bit #31 of the most significant word. It may be wrong,
+    // because the value v might lie in the range
+    //     2^255 - 19 <= v < 2^255
+    // and the additional 19 pushes exactly such a value across 2^255, so
+    // that the count read off afterwards is the correct one.
+    carry = guess * 19 + 19;
+
+    for (word = 0; word != 7; ++word)
+    {
+        carry += inout->as_uint32[word];
+        carry >>= 32;
+    }
+    carry += inout->as_uint32[7];
+
+    timesToSubtract = (uint32)(carry >> 31);
+
+    // Real pass: subtracting the prime k times is done by adding 19 * k and
+    // dropping everything from bit 255 upwards.
+    carry = timesToSubtract * 19;
+
+    for (word = 0; word != 7; ++word)
+    {
+        carry += inout->as_uint32[word];
+        inout->as_uint32[word] = (uint32)carry;
+        carry >>= 32;
+    }
+    carry += inout->as_uint32[7];
+    inout->as_uint32[7] = carry & 0x7ffffffful;
+}
+
+/// Writes a field element out as 32 bytes.
+/// fe25519 is stored in packed form already, so no change of representation
+/// is needed: the value is completely reduced in place (note that *in is
+/// modified by this) and its bytes are then copied out one by one.
+static void
+fe25519_pack(
+    unsigned char out[32],
+    volatile fe25519* in
+)
+{
+    uint8 idx = 0;
+
+    fe25519_reduceCompletely(in);
+    while (idx < 32)
+    {
+        out[idx] = in->as_uint8[idx];
+        idx++;
+    }
+}
+
+// r = x^(2^255 - 21), the exponent reached by the last step below; t0, t1, t2 are scratch. Note, that r and x are allowed to overlap!
+static void
+fe25519_invert_useProvidedScratchBuffers(
+ fe25519* r,
+ const fe25519* x,
+ fe25519* t0,
+ fe25519* t1,
+ fe25519* t2
+)
+{
+ fe25519 *z11 = r; // store z11 in r (in order to save one temporary).
+ fe25519 *z2_10_0 = t1;
+ fe25519 *z2_50_0 = t2;
+ fe25519 *z2_100_0 = z2_10_0;
+ // The leading /* ... */ tags below give the exponent of x held by the destination once that statement has run.
+ uint8 i;
+
+ {
+ fe25519 *z2 = z2_50_0; // t2 is borrowed for x^2 until z2_50_0 is first written
+
+ /* 2 */ fe25519_square(z2, x);
+ /* 4 */ fe25519_square(t0, z2);
+ /* 8 */ fe25519_square(t0, t0);
+ /* 9 */ fe25519_mul(z2_10_0, t0, x);
+ /* 11 */ fe25519_mul(z11, z2_10_0, z2); // last read of x; from here on r may be overwritten
+
+ // z2 is dead.
+ }
+
+ /* 22 */ fe25519_square(t0, z11);
+ /* 2^5 - 2^0 = 31 */ fe25519_mul(z2_10_0, t0, z2_10_0);
+
+ /* 2^6 - 2^1 */ fe25519_square(t0, z2_10_0);
+ /* 2^7 - 2^2 */ fe25519_square(t0, t0);
+ /* 2^8 - 2^3 */ fe25519_square(t0, t0);
+ /* 2^9 - 2^4 */ fe25519_square(t0, t0);
+ /* 2^10 - 2^5 */ fe25519_square(t0, t0);
+ /* 2^10 - 2^0 */ fe25519_mul(z2_10_0, t0, z2_10_0);
+
+ /* 2^11 - 2^1 */ fe25519_square(t0, z2_10_0);
+
+ /* 2^20 - 2^10 */ for (i = 1; i < 10; i ++) // 9 further squarings, 10 in total
+ {
+ fe25519_square(t0, t0);
+ }
+ /* 2^20 - 2^0 */ fe25519_mul(z2_50_0, t0, z2_10_0);
+
+ /* 2^21 - 2^1 */ fe25519_square(t0, z2_50_0);
+
+ /* 2^40 - 2^20 */ for (i = 1; i < 20; i ++) // 19 further squarings, 20 in total
+ {
+ fe25519_square(t0, t0);
+ }
+ /* 2^40 - 2^0 */ fe25519_mul(t0, t0, z2_50_0);
+
+ /* 2^41 - 2^1 */ fe25519_square(t0, t0);
+
+ /* 2^50 - 2^10 */ for (i = 1; i < 10; i ++) // 9 further squarings, 10 in total
+ {
+ fe25519_square(t0, t0);
+ }
+ /* 2^50 - 2^0 */ fe25519_mul(z2_50_0, t0, z2_10_0);
+
+ /* 2^51 - 2^1 */ fe25519_square(t0, z2_50_0);
+
+ /* 2^100 - 2^50 */ for (i = 1; i < 50; i ++) // 49 further squarings, 50 in total
+ {
+ fe25519_square(t0, t0);
+ }
+ /* 2^100 - 2^0 */ fe25519_mul(z2_100_0, t0, z2_50_0); // z2_100_0 reuses the buffer of z2_10_0 (t1)
+
+ /* 2^101 - 2^1 */ fe25519_square(t0, z2_100_0);
+
+ /* 2^200 - 2^100 */ for (i = 1; i < 100; i ++) // 99 further squarings, 100 in total
+ {
+ fe25519_square(t0, t0);
+ }
+ /* 2^200 - 2^0 */ fe25519_mul(t0, t0, z2_100_0);
+
+ /* 2^250 - 2^50 */ for (i = 0; i < 50; i ++) // this loop starts at 0: 50 squarings
+ {
+ fe25519_square(t0, t0);
+ }
+ /* 2^250 - 2^0 */ fe25519_mul(t0, t0, z2_50_0);
+
+ /* 2^255 - 2^5 */ for (i = 0; i < 5; i ++) // 5 squarings
+ {
+ fe25519_square(t0, t0);
+ }
+ /* 2^255 - 21 */ fe25519_mul(r, t0, z11);
+}
+
+// Sets all eight 32-bit words of *out to zero.
+static void
+fe25519_setzero(
+    fe25519* out
+)
+{
+    uint8 word = 0;
+    do
+    {
+        out->as_uint32[word] = 0;
+    } while (++word < 8);
+}
+
+// Sets *out to one: word 0 becomes 1, the seven words above it become 0.
+static void
+fe25519_setone(
+    fe25519* out
+)
+{
+    uint8 word;
+
+    for (word = 7; word > 0; word--)
+    {
+        out->as_uint32[word] = 0;
+    }
+    out->as_uint32[0] = 1;
+}
+
+/*
+static void
+swapPointersConditionally (void **p1, void **p2, uint8 condition)
+{
+ // Secure version of this code:
+ //
+ // if (condition)
+ // {
+ // void *temp;
+ // temp = *p2;
+ // *p2 = *p1;
+ // *p1 = temp;
+ // }
+
+ uintptr mask = condition;
+ uintptr val1 = (uintptr) *p1;
+ uintptr val2 = (uintptr) *p2;
+ uintptr temp = val2 ^ val1;
+
+ mask = (uintptr)( - (intptr) mask );
+ temp ^= mask & (temp ^ val1);
+ val1 ^= mask & (val1 ^ val2);
+ val2 ^= mask & (val2 ^ temp);
+
+ *p1 = (void *) val1;
+ *p2 = (void *) val2;
+}
+*/
+
+// Conditional swap without a branch: a condition of 1 exchanges *in1 and
+// *in2, a condition of 0 leaves both as they are.
+static void
+fe25519_cswap(
+    fe25519* in1,
+    fe25519* in2,
+    int condition
+)
+{
+    int32 mask = condition;
+    uint32 word = 0;
+
+    mask = -mask; // 0 stays 0, 1 turns into all ones
+
+    while (word < 8)
+    {
+        uint32 a = in1->as_uint32[word];
+        uint32 b = in2->as_uint32[word];
+        // The bits in which the two words differ, or 0 when mask is 0.
+        uint32 delta = mask & (b ^ a);
+
+        in1->as_uint32[word] = a ^ delta;
+        in2->as_uint32[word] = b ^ delta;
+        word++;
+    }
+}
+
+// ****************************************************
+// Scalarmultiplication implementation.
+// ****************************************************
+
+typedef struct _ST_curve25519ladderstepWorkingState
+{
+ // The base point in affine coordinates
+ fe25519 x0;
+
+ // The two working points p, q, in projective coordinates. Possibly randomized.
+ fe25519 xp;
+ fe25519 zp;
+ fe25519 xq;
+ fe25519 zq;
+
+ volatile UN_256bitValue s; // working copy of the scalar
+
+ int nextScalarBitToProcess; // bit index into s; the ladder loop counts it down
+ uint8 previousProcessedBit; // last scalar bit handled; XORed with the next one to get the swap flag
+
+#ifdef DH_SWAP_BY_POINTERS
+ fe25519 *pXp; // in this configuration the conditional swap exchanges these pointers instead of the data
+ fe25519 *pZp;
+ fe25519 *pXq;
+ fe25519 *pZq;
+#endif
+
+} ST_curve25519ladderstepWorkingState;
+
+static void
+curve25519_ladderstep(
+ ST_curve25519ladderstepWorkingState* pState
+)
+{
+ // Performs one ladder step in place on the working state, using the
+ // "ladd-1987-m-3" differential-addition-and-doubling formulas (1987 Montgomery,
+ // "Speeding the Pollard and elliptic curve methods of factorization", page 261,
+ // fifth and sixth displays, plus common-subexpression elimination).
+ //
+ // Notation from the explicit formulas database:
+ // (X2,Z2) corresponds to (xp,zp),
+ // (X3,Z3) corresponds to (xq,zq)
+ // Result (X4,Z4) (X5,Z5) expected in (xp,zp) and (xq,zq)
+ //
+ // A = X2+Z2; AA = A^2; B = X2-Z2; BB = B^2; E = AA-BB; C = X3+Z3; D = X3-Z3;
+ // DA = D*A; CB = C*B; t0 = DA+CB; t1 = t0^2; X5 = Z1*t1; t2 = DA-CB;
+ // t3 = t2^2; Z5 = X1*t3; X4 = AA*BB; t4 = a24*E; t5 = BB+t4; Z4 = E*t5 ;
+ // Re-ordered for using fewer temporaries. X1 is x0 here and no multiplication by Z1 is done, so X5 = t1.
+
+ fe25519 t1, t2;
+
+ #ifdef DH_SWAP_BY_POINTERS
+ fe25519 *b1=pState->pXp; fe25519 *b2=pState->pZp;
+ fe25519 *b3=pState->pXq; fe25519 *b4=pState->pZq;
+ #else
+ fe25519 *b1=&pState->xp; fe25519 *b2=&pState->zp;
+ fe25519 *b3=&pState->xq; fe25519 *b4=&pState->zq;
+ #endif
+ // b1..b4 are the four state coordinates, b5/b6 the two local temporaries; each is reused several times below.
+ fe25519 *b5= &t1; fe25519 *b6=&t2;
+
+ fe25519_add(b5,b1,b2); // A = X2+Z2
+ fe25519_sub(b6,b1,b2); // B = X2-Z2
+ fe25519_add(b1,b3,b4); // C = X3+Z3
+ fe25519_sub(b2,b3,b4); // D = X3-Z3
+ fe25519_mul(b3,b2,b5); // DA= D*A
+ fe25519_mul(b2,b1,b6); // CB= C*B
+ fe25519_add(b1,b2,b3); // T0= DA+CB
+ fe25519_sub(b4,b3,b2); // T2= DA-CB
+ fe25519_square(b3,b1); // X5==T1= T0^2
+ fe25519_square(b1,b4); // T3= t2^2
+ fe25519_mul(b4,b1,&pState->x0); // Z5=X1*t3
+ fe25519_square(b1,b5); // AA=A^2
+ fe25519_square(b5,b6); // BB=B^2
+ fe25519_sub(b2,b1,b5); // E=AA-BB
+ fe25519_mul(b1,b5,b1); // X4= AA*BB
+ fe25519_mpyWith121666 (b6,b2); // T4 = a24*E
+ fe25519_add(b6,b6,b5); // T5 = BB + t4
+ fe25519_mul(b2,b6,b2); // Z4 = E*t5
+}
+
+// Exchanges the working points P and Q of the ladder state when b is 1.
+static void
+curve25519_cswap(
+    ST_curve25519ladderstepWorkingState* state, uint8 b
+)
+{
+#ifndef DH_SWAP_BY_POINTERS
+    fe25519_cswap (&state->xp, &state->xq, b);
+    fe25519_cswap (&state->zp, &state->zq, b);
+#else
+    swapPointersConditionally ((void **) &state->pXp, (void **) &state->pXq, b);
+    swapPointersConditionally ((void **) &state->pZp, (void **) &state->pZq, b);
+#endif
+}
+
+#if DH_REPLACE_LAST_THREE_LADDERSTEPS_WITH_DOUBLINGS
+
+static void
+curve25519_doublePointP (ST_curve25519ladderstepWorkingState* pState)
+{
+    // Doubles the working point P in place with the doubling formula
+    // "dbl-1987-m-3" (1987 Montgomery, "Speeding the Pollard and elliptic
+    // curve methods of factorization", page 261, sixth display, plus
+    // common-subexpression elimination). In three operand code:
+    //
+    //   A  = X1+Z1
+    //   AA = A^2
+    //   B  = X1-Z1
+    //   BB = B^2
+    //   C  = AA-BB
+    //   X3 = AA*BB
+    //   t0 = a24*C
+    //   t1 = BB+t0
+    //   Z3 = C*t1
+    //
+    // The two coordinates of the state variable "Q" serve as scratch space
+    // and are overwritten: the first one holds A, AA and t0 in turn, the
+    // second one holds B, BB and t1.
+#ifndef DH_SWAP_BY_POINTERS
+    fe25519 *scratch1 = &pState->xq;
+    fe25519 *scratch2 = &pState->zq;
+    fe25519 *xOfP = &pState->xp;
+    fe25519 *zOfP = &pState->zp;
+#else
+    fe25519 *scratch1 = pState->pXq;
+    fe25519 *scratch2 = pState->pZq;
+    fe25519 *xOfP = pState->pXp;
+    fe25519 *zOfP = pState->pZp;
+#endif
+
+    // scratch1 = A = X1+Z1
+    fe25519_add(scratch1, xOfP, zOfP);
+    // scratch1 = AA = A^2
+    fe25519_square (scratch1, scratch1);
+    // scratch2 = B = X1-Z1
+    fe25519_sub(scratch2, xOfP, zOfP);
+    // scratch2 = BB = B^2
+    fe25519_square (scratch2, scratch2);
+    // X3 = AA*BB, written over X1
+    fe25519_mul (xOfP, scratch1, scratch2);
+    // C = AA-BB, written over Z1
+    fe25519_sub (zOfP, scratch1, scratch2);
+    // scratch1 = t0 = a24*C
+    fe25519_mpyWith121666 (scratch1, zOfP);
+    // scratch2 = t1 = BB+t0
+    fe25519_add (scratch2, scratch1, scratch2);
+    // Z3 = C*t1
+    fe25519_mul (zOfP, zOfP, scratch2);
+}
+
+#endif // #if DH_REPLACE_LAST_THREE_LADDERSTEPS_WITH_DOUBLINGS
+
+// Writes to r the 32-byte result of multiplying the point whose x-coordinate is p (32 bytes) by the scalar s (32 bytes). Always returns 0.
+int
+crypto_scalarmult_curve25519(
+    unsigned char* r,
+    const unsigned char* s,
+    const unsigned char* p
+)
+{
+    ST_curve25519ladderstepWorkingState state;
+    unsigned char i;
+
+    // Prepare the scalar within the working state buffer.
+    for (i = 0; i < 32; i++)
+    {
+        state.s.as_uint8 [i] = s[i];
+    }
+#if DH_REPLACE_LAST_THREE_LADDERSTEPS_WITH_DOUBLINGS
+    // The lowest three scalar bits are never read in this configuration (three
+    // explicit doublings take their place), so they need not be cleared.
+#else
+    state.s.as_uint8 [0] &= 248;
+#endif
+    state.s.as_uint8 [31] &= 127;
+    state.s.as_uint8 [31] |= 64;
+
+    // Copy the affine x-coordinate of the input point to the state.
+    fe25519_unpack (&state.x0, p);
+
+    // Prepare the working points within the working state struct: Q = (x0 : 1), P = (1 : 0).
+
+    fe25519_setone (&state.zq);
+    fe25519_cpy (&state.xq, &state.x0);
+
+    fe25519_setone(&state.xp);
+    fe25519_setzero(&state.zp);
+
+    state.nextScalarBitToProcess = 254;
+
+#ifdef DH_SWAP_BY_POINTERS
+    // we need to initially assign the pointers correctly.
+    state.pXp = &state.xp;
+    state.pZp = &state.zp;
+    state.pXq = &state.xq;
+    state.pZq = &state.zq;
+#endif
+
+    state.previousProcessedBit = 0;
+
+#if DH_REPLACE_LAST_THREE_LADDERSTEPS_WITH_DOUBLINGS
+    // Process all the bits except for the last three where we explicitly double the result.
+    while (state.nextScalarBitToProcess >= 3)
+#else
+    // Process every scalar bit, from bit 254 down to bit 0, with a ladder step.
+    while (state.nextScalarBitToProcess >= 0)
+#endif
+    {
+        uint8 byteNo = state.nextScalarBitToProcess >> 3;
+        uint8 bitNo = state.nextScalarBitToProcess & 7;
+        uint8 bit;
+        uint8 swap;
+
+        bit = 1 & (state.s.as_uint8 [byteNo] >> bitNo);
+        swap = bit ^ state.previousProcessedBit; // swap only when the bit value changes
+        state.previousProcessedBit = bit;
+        curve25519_cswap(&state, swap);
+        curve25519_ladderstep(&state);
+        state.nextScalarBitToProcess --;
+    }
+
+    curve25519_cswap(&state,state.previousProcessedBit); // swap back if the last processed bit was 1
+
+#if DH_REPLACE_LAST_THREE_LADDERSTEPS_WITH_DOUBLINGS
+    curve25519_doublePointP (&state);
+    curve25519_doublePointP (&state);
+    curve25519_doublePointP (&state);
+#endif
+
+#ifdef DH_SWAP_BY_POINTERS
+    // optimize for stack usage: Q's coordinates and x0 serve as scratch buffers.
+    fe25519_invert_useProvidedScratchBuffers (state.pZp, state.pZp, state.pXq,state.pZq,&state.x0);
+    fe25519_mul(state.pXp, state.pXp, state.pZp);
+
+    // No separate fe25519_reduceCompletely() call: fe25519_pack() reduces first.
+    fe25519_pack (r, state.pXp);
+#else
+    // optimize for stack usage: Q's coordinates and x0 serve as scratch buffers.
+    fe25519_invert_useProvidedScratchBuffers (&state.zp, &state.zp, &state.xq, &state.zq, &state.x0);
+    fe25519_mul(&state.xp, &state.xp, &state.zp);
+
+    // No separate fe25519_reduceCompletely() call: fe25519_pack() reduces first.
+    fe25519_pack (r, &state.xp);
+#endif
+
+    return 0;
+}
+
+// Multiplies the base point, whose x-coordinate is 9, by the scalar n and
+// writes the 32-byte result to q. Returns what the general routine returns.
+int
+crypto_scalarmult_curve25519_base(
+    unsigned char* q,
+    const unsigned char* n
+)
+{
+    // Byte 0 is 9; the remaining 31 bytes have no initializer and are
+    // therefore zero.
+    static const uint8 basePointX[32] = { 9 };
+
+    return crypto_scalarmult_curve25519(q, n, basePointX);
+}
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/sqr.s b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/sqr.s
new file mode 100644
index 00000000..3b190c92
--- /dev/null
+++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/sqr.s
@@ -0,0 +1,777 @@
+ .align 2
+ .global square256_asm // square256_asm: r0 = output pointer (16 words are written), r1 = input pointer (8 words are read)
+ .type square256_asm, %function // Three 128-bit squarings (low half, high half, |low - high|) are combined at the end.
+square256_asm:
+ push {r4-r7,lr} // save r4-r7 and the return address
+ mov r2, r8 // copy r8-r11 into r2-r5 so they can be pushed
+ mov r3, r9
+ mov r4, r10
+ mov r5, r11
+ push {r0-r5} // stack (top first): out ptr, in ptr, caller's r8-r11
+
+ mov r12, r0 // r12 = output pointer
+ mov r4, r1
+ ldm r4!, {r0-r3} // r0-r3 = input words 0-3; r4 advances to input word 4
+ push {r4} // remember the address of input word 4 for the high part
+ /////////BEGIN LOW PART //////////////////////
+ ///SQR 128, in r0-r3
+ mov r8, r2 // r8:r9 = upper two operand words, kept for the second SQR64
+ mov r9, r3
+ eor r4, r4
+ sub r2, r0 // r3:r2 = (upper two words) - (lower two words)
+ sbc r3, r1
+ sbc r4, r4 // r4 = 0, or 0xffffffff if the subtraction borrowed
+ eor r2, r4 // xor with the mask, then subtract it: negate when it borrowed
+ eor r3, r4
+ sub r2, r4
+ sbc r3, r4
+ mov r10, r2 // r10:r11 = |upper - lower|, kept for the third SQR64
+ mov r11, r3
+ //SQR64, in: r0, r1, out: r0-r2, r7, used: r0-r7
+ mov r2, r0
+ eor r3, r3
+ sub r2, r1
+ sbc r3, r3 // r3 = 0, or 0xffffffff if r0 - r1 borrowed
+ eor r2, r3
+ sub r2, r3 // r2 = |r0 - r1|
+ lsr r3, r0, #16 // split r0 into 16-bit halves: r3 = upper, r0 = lower
+ uxth r0, r0
+ mov r4, r0
+ mul r4, r3 // r4 = lower half * upper half
+ mul r0, r0
+ mul r3, r3
+ lsr r5, r4, #16
+ lsl r4, #16
+ add r0, r4 // add the cross product (shifted left 16) twice
+ adc r3, r5
+ add r0, r4
+ adc r3, r5 // r3:r0 = square of the input r0
+ lsr r4, r1, #16 // same 16-bit split and squaring for r1
+ uxth r1, r1
+ mov r5, r1
+ mul r5, r4
+ mul r1, r1
+ mul r4, r4
+ eor r6, r6 // r6 = 0, used to add only the carry
+ add r1, r3 // fold the upper word of r0^2 into the lower word of r1^2
+ adc r4, r6
+ lsr r3, r5, #16
+ lsl r5, r5, #16
+ add r1, r5
+ adc r4, r3
+ add r1, r5
+ adc r3, r4 // r0, r1, r3 = r0^2 + (r1^2 << 32) for the input r0, r1
+ lsr r4, r2, #16 // square the absolute difference held in r2
+ uxth r2, r2
+ mov r5, r2
+ mul r5, r4
+ mul r2, r2
+ mul r4, r4
+ lsr r6, r5, #16
+ lsl r5, #16
+ add r2, r5
+ adc r4, r6
+ add r5, r2
+ adc r6, r4 // r6:r5 = |r0 - r1|^2
+ eor r7, r7
+ mov r2, r1
+ sub r1, r5 // subtract the squared difference from the two middle words
+ sbc r2, r6
+ sbc r7, r7 // r7 = borrow mask of that subtraction
+ add r1, r0
+ adc r2, r3
+ adc r7, r3 // square is in r0, r1, r2, r7 (top word lands in r7 in this instance)
+ mov r3, r12
+ stm r3!, {r0-r1} // store the two lowest result words to out[0..1]
+ push {r3} // save the advanced output pointer (address of out[2])
+
+ mov r12, r0 // r12 = word 0 of the first square
+ mov r0, r8 // r0:r1 = upper two operand words saved in r8:r9
+ mov r8, r1 // r8 = word 1 of the first square
+ mov r1, r9
+ mov r9, r2 // r9 = word 2 of the first square (word 3 stays in r7)
+ //SQR64, in: r0, r1, out: r0-r3, used: r0-r6
+ mov r2, r0
+ eor r3, r3
+ sub r2, r1
+ sbc r3, r3 // r3 = 0, or 0xffffffff if r0 - r1 borrowed
+ eor r2, r3
+ sub r2, r3 // r2 = |r0 - r1|
+ lsr r3, r0, #16 // split r0 into 16-bit halves: r3 = upper, r0 = lower
+ uxth r0, r0
+ mov r4, r0
+ mul r4, r3 // r4 = lower half * upper half
+ mul r0, r0
+ mul r3, r3
+ lsr r5, r4, #16
+ lsl r4, #16
+ add r0, r4 // add the cross product (shifted left 16) twice
+ adc r3, r5
+ add r0, r4
+ adc r3, r5 // r3:r0 = square of the input r0
+ lsr r4, r1, #16 // same 16-bit split and squaring for r1
+ uxth r1, r1
+ mov r5, r1
+ mul r5, r4
+ mul r1, r1
+ mul r4, r4
+ eor r6, r6 // r6 = 0, used to add only the carry
+ add r1, r3 // fold the upper word of r0^2 into the lower word of r1^2
+ adc r4, r6
+ lsr r3, r5, #16
+ lsl r5, r5, #16
+ add r1, r5
+ adc r4, r3
+ add r1, r5
+ adc r3, r4 // r0, r1, r3 = r0^2 + (r1^2 << 32) for the input r0, r1
+ lsr r4, r2, #16 // square the absolute difference held in r2
+ uxth r2, r2
+ mov r5, r2
+ mul r5, r4
+ mul r2, r2
+ mul r4, r4
+ lsr r6, r5, #16
+ lsl r5, #16
+ add r2, r5
+ adc r4, r6
+ add r5, r2
+ adc r6, r4 // r6:r5 = |r0 - r1|^2
+ eor r4, r4
+ mov r2, r1
+ sub r1, r5 // subtract the squared difference from the two middle words
+ sbc r2, r6
+ sbc r4, r4 // r4 = borrow mask of that subtraction
+ add r1, r0
+ adc r2, r3
+ adc r3, r4 // r0-r3 = 128-bit square of the 64-bit input
+ eor r4, r4
+ mov r6, r9
+ add r0, r6 // add the upper two words of the first square (r9, r7) onto the second square
+ adc r7, r1
+ adc r2, r4
+ adc r3, r4 // running sum is now in r0, r7, r2, r3
+ mov r1, r11 // r0:r1 = |upper - lower| saved in r10:r11
+ mov r11, r0 // park the running sum in r11, r7, r9, r10
+ mov r0, r10
+ mov r9, r2
+ mov r10,r3
+ //SQR64, in: r0, r1, out: r0-r3, used: r0-r6
+ mov r2, r0
+ eor r3, r3
+ sub r2, r1
+ sbc r3, r3 // r3 = 0, or 0xffffffff if r0 - r1 borrowed
+ eor r2, r3
+ sub r2, r3 // r2 = |r0 - r1|
+ lsr r3, r0, #16 // split r0 into 16-bit halves: r3 = upper, r0 = lower
+ uxth r0, r0
+ mov r4, r0
+ mul r4, r3 // r4 = lower half * upper half
+ mul r0, r0
+ mul r3, r3
+ lsr r5, r4, #16
+ lsl r4, #16
+ add r0, r4 // add the cross product (shifted left 16) twice
+ adc r3, r5
+ add r0, r4
+ adc r3, r5 // r3:r0 = square of the input r0
+ lsr r4, r1, #16 // same 16-bit split and squaring for r1
+ uxth r1, r1
+ mov r5, r1
+ mul r5, r4
+ mul r1, r1
+ mul r4, r4
+ eor r6, r6 // r6 = 0, used to add only the carry
+ add r1, r3 // fold the upper word of r0^2 into the lower word of r1^2
+ adc r4, r6
+ lsr r3, r5, #16
+ lsl r5, r5, #16
+ add r1, r5
+ adc r4, r3
+ add r1, r5
+ adc r3, r4 // r0, r1, r3 = r0^2 + (r1^2 << 32) for the input r0, r1
+ lsr r4, r2, #16 // square the absolute difference held in r2
+ uxth r2, r2
+ mov r5, r2
+ mul r5, r4
+ mul r2, r2
+ mul r4, r4
+ lsr r6, r5, #16
+ lsl r5, #16
+ add r2, r5
+ adc r4, r6
+ add r5, r2
+ adc r6, r4 // r6:r5 = |r0 - r1|^2
+ eor r4, r4
+ mov r2, r1
+ sub r1, r5 // subtract the squared difference from the two middle words
+ sbc r2, r6
+ sbc r4, r4 // r4 = borrow mask of that subtraction
+ add r1, r0
+ adc r2, r3
+ adc r3, r4 // r0-r3 = 128-bit square of the 64-bit input
+ mov r6, r11 // two copies of the running-sum words (r11, r7): r6:r7 and r4:r5
+ mov r4, r11
+ mov r5, r7
+ sub r6, r0 // subtract the squared difference (r0-r3) across both copies
+ sbc r7, r1
+ sbc r4, r2
+ sbc r5, r3
+ eor r1, r1
+ sbc r1, r1 // r1 = borrow mask of the four-word subtraction
+ mov r2, r12
+ mov r3, r8
+ add r2, r6 // add in the words parked in r12, r8, r9, r10
+ adc r3, r7
+ mov r6, r9
+ mov r7, r10
+ adc r4, r6
+ adc r5, r7
+ adc r6, r1 // top two words absorb the carry and the borrow mask
+ adc r7, r1
+ //results r12, r8, r2-r7
+ /////////END LOW PART ////////////////////////
+ pop {r0,r1} // r0 = address of out[2], r1 = address of input word 4
+ stm r0!, {r2, r3} // store out[2..3]; r0 advances to out[4]
+ push {r0, r4-r7} // keep the out[4] address and the upper four words of the low square
+ ldm r1, {r0-r3} // r0-r3 = input words 4-7
+ /////////BEGIN HIGH PART //////////////////////
+ ///SQR 128, in r0-r3
+ mov r8, r2 // r8:r9 = upper two operand words, kept for the second SQR64
+ mov r9, r3
+ eor r4, r4
+ sub r2, r0 // r3:r2 = (upper two words) - (lower two words)
+ sbc r3, r1
+ sbc r4, r4 // r4 = 0, or 0xffffffff if the subtraction borrowed
+ eor r2, r4 // xor with the mask, then subtract it: negate when it borrowed
+ eor r3, r4
+ sub r2, r4
+ sbc r3, r4
+ mov r10, r2 // r10:r11 = |upper - lower|, kept for the third SQR64
+ mov r11, r3
+ //SQR64, in: r0, r1, out: r0-r2, r7, used: r0-r7
+ mov r2, r0
+ eor r3, r3
+ sub r2, r1
+ sbc r3, r3 // r3 = 0, or 0xffffffff if r0 - r1 borrowed
+ eor r2, r3
+ sub r2, r3 // r2 = |r0 - r1|
+ lsr r3, r0, #16 // split r0 into 16-bit halves: r3 = upper, r0 = lower
+ uxth r0, r0
+ mov r4, r0
+ mul r4, r3 // r4 = lower half * upper half
+ mul r0, r0
+ mul r3, r3
+ lsr r5, r4, #16
+ lsl r4, #16
+ add r0, r4 // add the cross product (shifted left 16) twice
+ adc r3, r5
+ add r0, r4
+ adc r3, r5 // r3:r0 = square of the input r0
+ lsr r4, r1, #16 // same 16-bit split and squaring for r1
+ uxth r1, r1
+ mov r5, r1
+ mul r5, r4
+ mul r1, r1
+ mul r4, r4
+ eor r6, r6 // r6 = 0, used to add only the carry
+ add r1, r3 // fold the upper word of r0^2 into the lower word of r1^2
+ adc r4, r6
+ lsr r3, r5, #16
+ lsl r5, r5, #16
+ add r1, r5
+ adc r4, r3
+ add r1, r5
+ adc r3, r4 // r0, r1, r3 = r0^2 + (r1^2 << 32) for the input r0, r1
+ lsr r4, r2, #16 // square the absolute difference held in r2
+ uxth r2, r2
+ mov r5, r2
+ mul r5, r4
+ mul r2, r2
+ mul r4, r4
+ lsr r6, r5, #16
+ lsl r5, #16
+ add r2, r5
+ adc r4, r6
+ add r5, r2
+ adc r6, r4 // r6:r5 = |r0 - r1|^2
+ eor r7, r7
+ mov r2, r1
+ sub r1, r5 // subtract the squared difference from the two middle words
+ sbc r2, r6
+ sbc r7, r7 // r7 = borrow mask of that subtraction
+ add r1, r0
+ adc r2, r3
+ adc r7, r3 // square is in r0, r1, r2, r7 (top word lands in r7 in this instance)
+ mov r12, r0 // r12 = word 0 of the first square
+ mov r0, r8 // r0:r1 = upper two operand words saved in r8:r9
+ mov r8, r1 // r8 = word 1 of the first square
+ mov r1, r9
+ mov r9, r2 // r9 = word 2 of the first square (word 3 stays in r7)
+ //SQR64, in: r0, r1, out: r0-r3, used: r0-r6
+ mov r2, r0
+ eor r3, r3
+ sub r2, r1
+ sbc r3, r3 // r3 = 0, or 0xffffffff if r0 - r1 borrowed
+ eor r2, r3
+ sub r2, r3 // r2 = |r0 - r1|
+ lsr r3, r0, #16 // split r0 into 16-bit halves: r3 = upper, r0 = lower
+ uxth r0, r0
+ mov r4, r0
+ mul r4, r3 // r4 = lower half * upper half
+ mul r0, r0
+ mul r3, r3
+ lsr r5, r4, #16
+ lsl r4, #16
+ add r0, r4 // add the cross product (shifted left 16) twice
+ adc r3, r5
+ add r0, r4
+ adc r3, r5 // r3:r0 = square of the input r0
+ lsr r4, r1, #16 // same 16-bit split and squaring for r1
+ uxth r1, r1
+ mov r5, r1
+ mul r5, r4
+ mul r1, r1
+ mul r4, r4
+ eor r6, r6 // r6 = 0, used to add only the carry
+ add r1, r3 // fold the upper word of r0^2 into the lower word of r1^2
+ adc r4, r6
+ lsr r3, r5, #16
+ lsl r5, r5, #16
+ add r1, r5
+ adc r4, r3
+ add r1, r5
+ adc r3, r4 // r0, r1, r3 = r0^2 + (r1^2 << 32) for the input r0, r1
+ lsr r4, r2, #16 // square the absolute difference held in r2
+ uxth r2, r2
+ mov r5, r2
+ mul r5, r4
+ mul r2, r2
+ mul r4, r4
+ lsr r6, r5, #16
+ lsl r5, #16
+ add r2, r5
+ adc r4, r6
+ add r5, r2
+ adc r6, r4 // r6:r5 = |r0 - r1|^2
+ eor r4, r4
+ mov r2, r1
+ sub r1, r5 // subtract the squared difference from the two middle words
+ sbc r2, r6
+ sbc r4, r4 // r4 = borrow mask of that subtraction
+ add r1, r0
+ adc r2, r3
+ adc r3, r4 // r0-r3 = 128-bit square of the 64-bit input
+ eor r4, r4
+ mov r6, r9
+ add r0, r6 // add the upper two words of the first square (r9, r7) onto the second square
+ adc r7, r1
+ adc r2, r4
+ adc r3, r4 // running sum is now in r0, r7, r2, r3
+ mov r1, r11 // r0:r1 = |upper - lower| saved in r10:r11
+ mov r11, r0 // park the running sum in r11, r7, r9, r10
+ mov r0, r10
+ mov r9, r2
+ mov r10,r3
+ //SQR64, in: r0, r1, out: r0-r3, used: r0-r6
+ mov r2, r0
+ eor r3, r3
+ sub r2, r1
+ sbc r3, r3 // r3 = 0, or 0xffffffff if r0 - r1 borrowed
+ eor r2, r3
+ sub r2, r3 // r2 = |r0 - r1|
+ lsr r3, r0, #16 // split r0 into 16-bit halves: r3 = upper, r0 = lower
+ uxth r0, r0
+ mov r4, r0
+ mul r4, r3 // r4 = lower half * upper half
+ mul r0, r0
+ mul r3, r3
+ lsr r5, r4, #16
+ lsl r4, #16
+ add r0, r4 // add the cross product (shifted left 16) twice
+ adc r3, r5
+ add r0, r4
+ adc r3, r5 // r3:r0 = square of the input r0
+ lsr r4, r1, #16 // same 16-bit split and squaring for r1
+ uxth r1, r1
+ mov r5, r1
+ mul r5, r4
+ mul r1, r1
+ mul r4, r4
+ eor r6, r6 // r6 = 0, used to add only the carry
+ add r1, r3 // fold the upper word of r0^2 into the lower word of r1^2
+ adc r4, r6
+ lsr r3, r5, #16
+ lsl r5, r5, #16
+ add r1, r5
+ adc r4, r3
+ add r1, r5
+ adc r3, r4 // r0, r1, r3 = r0^2 + (r1^2 << 32) for the input r0, r1
+ lsr r4, r2, #16 // square the absolute difference held in r2
+ uxth r2, r2
+ mov r5, r2
+ mul r5, r4
+ mul r2, r2
+ mul r4, r4
+ lsr r6, r5, #16
+ lsl r5, #16
+ add r2, r5
+ adc r4, r6
+ add r5, r2
+ adc r6, r4 // r6:r5 = |r0 - r1|^2
+ eor r4, r4
+ mov r2, r1
+ sub r1, r5 // subtract the squared difference from the two middle words
+ sbc r2, r6
+ sbc r4, r4 // r4 = borrow mask of that subtraction
+ add r1, r0
+ adc r2, r3
+ adc r3, r4 // r0-r3 = 128-bit square of the 64-bit input
+ mov r6, r11 // two copies of the running-sum words (r11, r7): r6:r7 and r4:r5
+ mov r4, r11
+ mov r5, r7
+ sub r6, r0 // subtract the squared difference (r0-r3) across both copies
+ sbc r7, r1
+ sbc r4, r2
+ sbc r5, r3
+ eor r1, r1
+ sbc r1, r1 // r1 = borrow mask of the four-word subtraction
+ mov r2, r12
+ mov r3, r8
+ add r2, r6 // add in the words parked in r12, r8, r9, r10
+ adc r3, r7
+ mov r6, r9
+ mov r7, r10
+ adc r4, r6
+ adc r5, r7
+ adc r6, r1 // top two words absorb the carry and the borrow mask
+ adc r7, r1
+ //results r12, r8, r2-r7
+ /////////END HIGH PART ////////////////////////
+ mov r0, r12 // gather the lower four words of the high square in r0-r3
+ mov r1, r8
+ mov r8, r4 // park its upper four words in r8-r11
+ mov r9, r5
+ mov r10, r6
+ mov r11, r7
+ pop {r4} // r4 = address of out[4]
+ mov r12, r4//str
+ pop {r4-r7} // upper four words of the low square
+ add r0, r4 // r0-r3 = upper(low square) + lower(high square)
+ adc r1, r5
+ adc r2, r6
+ adc r3, r7
+ mov r4, r12
+ stm r4!, {r0-r3}//low part
+ mov r4, r8 // bring the upper four words of the high square back into r4-r7
+ mov r5, r9
+ mov r6, r10
+ mov r7, r11
+ eor r0, r0
+ adc r4, r0 // add 0 + carry flag; presumably still the carry of the adc chain above (the mov/stm/eor in between are assumed to leave C alone) - confirm
+ adc r5, r0
+ adc r6, r0
+ adc r7, r0
+ pop {r0, r1} //r0->out, r1, in
+ push {r0,r4-r7} // keep the out pointer and the carry-adjusted upper words of the high square
+ ldm r1, {r0-r7} // r0-r7 = all eight input words
+ sub r0, r4 // r0-r3 = (input words 0-3) - (input words 4-7)
+ sbc r1, r5
+ sbc r2, r6
+ sbc r3, r7
+ sbc r4, r4 // r4 = 0, or 0xffffffff if the subtraction borrowed
+ eor r0, r4 // xor with the mask, then subtract it: negate when it borrowed
+ eor r1, r4
+ eor r2, r4
+ eor r3, r4
+ sub r0, r4
+ sbc r1, r4
+ sbc r2, r4
+ sbc r3, r4 // r0-r3 = |low half - high half|
+ //////////BEGIN MIDDLE PART////////////////
+ ///SQR 128, in r0-r3
+ mov r8, r2 // r8:r9 = upper two operand words, kept for the second SQR64
+ mov r9, r3
+ eor r4, r4
+ sub r2, r0 // r3:r2 = (upper two words) - (lower two words)
+ sbc r3, r1
+ sbc r4, r4 // r4 = 0, or 0xffffffff if the subtraction borrowed
+ eor r2, r4 // xor with the mask, then subtract it: negate when it borrowed
+ eor r3, r4
+ sub r2, r4
+ sbc r3, r4
+ mov r10, r2 // r10:r11 = |upper - lower|, kept for the third SQR64
+ mov r11, r3
+ //SQR64, in: r0, r1, out: r0-r2, r7, used: r0-r7
+ mov r2, r0
+ eor r3, r3
+ sub r2, r1
+ sbc r3, r3 // r3 = 0, or 0xffffffff if r0 - r1 borrowed
+ eor r2, r3
+ sub r2, r3 // r2 = |r0 - r1|
+ lsr r3, r0, #16 // split r0 into 16-bit halves: r3 = upper, r0 = lower
+ uxth r0, r0
+ mov r4, r0
+ mul r4, r3 // r4 = lower half * upper half
+ mul r0, r0
+ mul r3, r3
+ lsr r5, r4, #16
+ lsl r4, #16
+ add r0, r4 // add the cross product (shifted left 16) twice
+ adc r3, r5
+ add r0, r4
+ adc r3, r5 // r3:r0 = square of the input r0
+ lsr r4, r1, #16 // same 16-bit split and squaring for r1
+ uxth r1, r1
+ mov r5, r1
+ mul r5, r4
+ mul r1, r1
+ mul r4, r4
+ eor r6, r6 // r6 = 0, used to add only the carry
+ add r1, r3 // fold the upper word of r0^2 into the lower word of r1^2
+ adc r4, r6
+ lsr r3, r5, #16
+ lsl r5, r5, #16
+ add r1, r5
+ adc r4, r3
+ add r1, r5
+ adc r3, r4 // r0, r1, r3 = r0^2 + (r1^2 << 32) for the input r0, r1
+ lsr r4, r2, #16 // square the absolute difference held in r2
+ uxth r2, r2
+ mov r5, r2
+ mul r5, r4
+ mul r2, r2
+ mul r4, r4
+ lsr r6, r5, #16
+ lsl r5, #16
+ add r2, r5
+ adc r4, r6
+ add r5, r2
+ adc r6, r4 // r6:r5 = |r0 - r1|^2
+ eor r7, r7
+ mov r2, r1
+ sub r1, r5 // subtract the squared difference from the two middle words
+ sbc r2, r6
+ sbc r7, r7 // r7 = borrow mask of that subtraction
+ add r1, r0
+ adc r2, r3
+ adc r7, r3 // square is in r0, r1, r2, r7 (top word lands in r7 in this instance)
+ mov r12, r0 // r12 = word 0 of the first square
+ mov r0, r8 // r0:r1 = upper two operand words saved in r8:r9
+ mov r8, r1 // r8 = word 1 of the first square
+ mov r1, r9
+ mov r9, r2 // r9 = word 2 of the first square (word 3 stays in r7)
+ //SQR64, in: r0, r1, out: r0-r3, used: r0-r6
+ mov r2, r0
+ eor r3, r3
+ sub r2, r1
+ sbc r3, r3 // r3 = 0, or 0xffffffff if r0 - r1 borrowed
+ eor r2, r3
+ sub r2, r3 // r2 = |r0 - r1|
+ lsr r3, r0, #16 // split r0 into 16-bit halves: r3 = upper, r0 = lower
+ uxth r0, r0
+ mov r4, r0
+ mul r4, r3 // r4 = lower half * upper half
+ mul r0, r0
+ mul r3, r3
+ lsr r5, r4, #16
+ lsl r4, #16
+ add r0, r4 // add the cross product (shifted left 16) twice
+ adc r3, r5
+ add r0, r4
+ adc r3, r5 // r3:r0 = square of the input r0
+ lsr r4, r1, #16 // same 16-bit split and squaring for r1
+ uxth r1, r1
+ mov r5, r1
+ mul r5, r4
+ mul r1, r1
+ mul r4, r4
+ eor r6, r6 // r6 = 0, used to add only the carry
+ add r1, r3 // fold the upper word of r0^2 into the lower word of r1^2
+ adc r4, r6
+ lsr r3, r5, #16
+ lsl r5, r5, #16
+ add r1, r5
+ adc r4, r3
+ add r1, r5
+ adc r3, r4 // r0, r1, r3 = r0^2 + (r1^2 << 32) for the input r0, r1
+ lsr r4, r2, #16 // square the absolute difference held in r2
+ uxth r2, r2
+ mov r5, r2
+ mul r5, r4
+ mul r2, r2
+ mul r4, r4
+ lsr r6, r5, #16
+ lsl r5, #16
+ add r2, r5
+ adc r4, r6
+ add r5, r2
+ adc r6, r4 // r6:r5 = |r0 - r1|^2
+ eor r4, r4
+ mov r2, r1
+ sub r1, r5 // subtract the squared difference from the two middle words
+ sbc r2, r6
+ sbc r4, r4 // r4 = borrow mask of that subtraction
+ add r1, r0
+ adc r2, r3
+ adc r3, r4 // r0-r3 = 128-bit square of the 64-bit input
+ eor r4, r4
+ mov r6, r9
+ add r0, r6 // add the upper two words of the first square (r9, r7) onto the second square
+ adc r7, r1
+ adc r2, r4
+ adc r3, r4 // running sum is now in r0, r7, r2, r3
+ mov r1, r11 // r0:r1 = |upper - lower| saved in r10:r11
+ mov r11, r0 // park the running sum in r11, r7, r9, r10
+ mov r0, r10
+ mov r9, r2
+ mov r10,r3
+ //SQR64, in: r0, r1, out: r0-r3, used: r0-r6
+ mov r2, r0
+ eor r3, r3
+ sub r2, r1
+ sbc r3, r3 // r3 = 0, or 0xffffffff if r0 - r1 borrowed
+ eor r2, r3
+ sub r2, r3 // r2 = |r0 - r1|
+ lsr r3, r0, #16 // split r0 into 16-bit halves: r3 = upper, r0 = lower
+ uxth r0, r0
+ mov r4, r0
+ mul r4, r3 // r4 = lower half * upper half
+ mul r0, r0
+ mul r3, r3
+ lsr r5, r4, #16
+ lsl r4, #16
+ add r0, r4 // add the cross product (shifted left 16) twice
+ adc r3, r5
+ add r0, r4
+ adc r3, r5 // r3:r0 = square of the input r0
+ lsr r4, r1, #16 // same 16-bit split and squaring for r1
+ uxth r1, r1
+ mov r5, r1
+ mul r5, r4
+ mul r1, r1
+ mul r4, r4
+ eor r6, r6 // r6 = 0, used to add only the carry
+ add r1, r3 // fold the upper word of r0^2 into the lower word of r1^2
+ adc r4, r6
+ lsr r3, r5, #16
+ lsl r5, r5, #16
+ add r1, r5
+ adc r4, r3
+ add r1, r5
+ adc r3, r4 // r0, r1, r3 = r0^2 + (r1^2 << 32) for the input r0, r1
+ lsr r4, r2, #16 // square the absolute difference held in r2
+ uxth r2, r2
+ mov r5, r2
+ mul r5, r4
+ mul r2, r2
+ mul r4, r4
+ lsr r6, r5, #16
+ lsl r5, #16
+ add r2, r5
+ adc r4, r6
+ add r5, r2
+ adc r6, r4 // r6:r5 = |r0 - r1|^2
+ eor r4, r4
+ mov r2, r1
+ sub r1, r5 // subtract the squared difference from the two middle words
+ sbc r2, r6
+ sbc r4, r4 // r4 = borrow mask of that subtraction
+ add r1, r0
+ adc r2, r3
+ adc r3, r4 // r0-r3 = 128-bit square of the 64-bit input
+ mov r6, r11 // two copies of the running-sum words (r11, r7): r6:r7 and r4:r5
+ mov r4, r11
+ mov r5, r7
+ sub r6, r0 // subtract the squared difference (r0-r3) across both copies
+ sbc r7, r1
+ sbc r4, r2
+ sbc r5, r3
+ eor r1, r1
+ sbc r1, r1 // r1 = borrow mask of the four-word subtraction
+ mov r2, r12
+ mov r3, r8
+ add r2, r6 // add in the words parked in r12, r8, r9, r10
+ adc r3, r7
+ mov r6, r9
+ mov r7, r10
+ adc r4, r6
+ adc r5, r7
+ adc r6, r1 // top two words absorb the carry and the borrow mask
+ adc r7, r1
+ //results r12, r8, r2-r7
+ //////////END MIDDLE PART//////////////////
+ mvn r2, r2 // bitwise-complement the middle square (r12 and r8 are complemented further down)
+ mvn r3, r3
+ mvn r4, r4
+ mvn r5, r5
+ mvn r6, r6
+ mvn r7, r7
+ pop {r1} // r1 = output pointer
+ push {r4-r7} // park the complemented upper four words of the middle square
+ mov r4, #1
+ asr r4, #1 // shifts the 1 out of bit 0; presumably done to set the carry flag for the adc chain below (assumes a flag-setting Thumb encoding - confirm)
+ ldm r1!, {r4-r7} // r4-r7 = out[0..3]; r1 advances to out[4]
+ mov r0, r12
+ mov r12, r1 ////////ref
+ mov r1, r8
+ mvn r0, r0
+ mvn r1, r1
+ adc r0, r4 // r0-r3 = ~(middle words 0-3) + out[0..3] + carry flag
+ adc r1, r5
+ adc r2, r6
+ adc r3, r7
+ eor r4, r4
+ adc r4, r4 // r4 = carry out of that chain
+ mov r8, r4 //carry A --ini
+ mov r4, r12
+ ldm r4, {r4-r7} // r4-r7 = out[4..7] as stored after the high part
+ add r0, r4
+ adc r1, r5
+ adc r2, r6
+ adc r3, r7
+ mov r9, r4 // r4 is needed as the store pointer; stash its value in r9
+ mov r4, r12
+ stm r4!, {r0-r3} // overwrite out[4..7] with the combined words
+ mov r12, r4 // r12 = address of out[8]
+ mov r4, r9
+ pop {r0-r3} // complemented upper four words of the middle square
+ adc r4, r0
+ adc r5, r1
+ adc r6, r2
+ adc r7, r3
+ eor r0, r0
+ adc r0, r0
+ mov r9, r0 //carry B --ini
+ mov r0, r8
+ asr r0, #1 //carry A --end
+ pop {r0-r3} // upper four words of the high square pushed before the middle part
+ adc r4, r0
+ adc r5, r1
+ adc r6, r2
+ adc r7, r3
+ mov r8, r0 // free r0 for use as the store pointer
+ mov r0, r12
+ stm r0!, {r4-r7} // store out[8..11]
+ mov r11, r0 // r11 = address of out[12]
+ mov r0, r8
+ eor r4, r4
+ mov r5, r9
+ adc r5, r4 //carry B --end
+ mvn r6, r4 // r6 = 0xffffffff
+ add r5, r6 // r5 = (accumulated carries) - 1
+ adc r6, r4 // r6 = 0 if that add carried, otherwise 0xffffffff
+ add r0, r5 // apply the correction in r5 / r6 to the top four words
+ adc r1, r6
+ adc r2, r6
+ adc r3, r6
+ mov r7, r11
+ stm r7!, {r0-r3} // store out[12..15]
+
+ pop {r3-r6} // caller's r8-r11 pushed in the prologue
+ mov r8, r3
+ mov r9, r4
+ mov r10, r5
+ mov r11, r6
+ pop {r4-r7,pc} // restore r4-r7 and return by popping the saved lr into pc
+ bx lr // NOTE(review): comes after a pop into pc, so this looks unreachable - confirm
+ .size square256_asm, .-square256_asm