diff options
Diffstat (limited to '')
30 files changed, 4941 insertions, 0 deletions
diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/.gitignore b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/.gitignore new file mode 100644 index 00000000..5841cb8e --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/.gitignore @@ -0,0 +1,3 @@ +*.log +*.elf +*.bin diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/Makefile b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/Makefile new file mode 100644 index 00000000..7be9a53a --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/Makefile @@ -0,0 +1,184 @@ +FUNCS = do_nothing stack_8w stack_64w \ + hashtest_sha256 hashtest_sha512 \ + hashtest_sha3_256 hashtest_sha3_512 \ + aes128block_test aes128sched_test \ + aes256block_test aes256sched_test \ + aes128gcm_test aes128eax_test \ + aes128ccm_test \ + salsa20_test chacha20_test \ + poly1305_test hmacsha256_test \ + curve25519_test \ + norx_test + +AEADS = aeadperf_aes128gcm \ + aeadperf_aes128ccm \ + aeadperf_aes128eax \ + aeadperf_aes256gcm \ + aeadperf_aes256ccm \ + aeadperf_aes256eax \ + aeadperf_norx \ + aeadperf_chacha20poly1305 +TESTS = testcurve25519 testaes testmodes testsalsa20 testsha1 testsha2 \ + testsha3 testpoly1305 testnorx testchacha20poly1305 testdrbg +ARCHS = stm32f0 stm32f1 stm32f3 efm32 qemucm3 + +all: $(patsubst %,%.stm32f0.bin,$(FUNCS) $(AEADS) $(TESTS)) \ + $(patsubst %,%.stm32f1.bin,$(FUNCS) $(AEADS) $(TESTS)) \ + $(patsubst %,%.stm32f3.bin,$(FUNCS) $(AEADS) $(TESTS)) \ + $(patsubst %,%.efm32.bin,$(FUNCS) $(AEADS) $(TESTS)) \ + $(patsubst %,%.qemucm3.bin,$(FUNCS) $(AEADS) $(TESTS)) + +%.stm32f0.elf: + arm-none-eabi-gcc $(CFLAGS) $(CFLAGS_$*) $(LDFLAGS) -T linkscript.stm32f0.ld -mcpu=cortex-m0 -DCORTEX_M0 -o $@ $^ -DTEST=$* -lgcc + +%.stm32f1.elf: + arm-none-eabi-gcc $(CFLAGS) $(CFLAGS_$*) $(LDFLAGS) -T linkscript.stm32f1.ld -mcpu=cortex-m3 -DCORTEX_M3 -o $@ $^ -DTEST=$* -lgcc + +%.stm32f3.elf: + arm-none-eabi-gcc $(CFLAGS) $(CFLAGS_$*) $(LDFLAGS) -T 
linkscript.stm32f3.ld -mcpu=cortex-m4 -DCORTEX_M4 -o $@ $^ -DTEST=$* -lgcc + +%.efm32.elf: + arm-none-eabi-gcc $(CFLAGS) $(CFLAGS_$*) $(LDFLAGS) -T linkscript.efm32.ld -mcpu=cortex-m0 -DCORTEX_M0 -o $@ $^ -DTEST=$* -lgcc + +%.qemucm3.elf: + arm-none-eabi-gcc $(CFLAGS) $(CFLAGS_$*) $(LDFLAGS) -T linkscript.qemucm3.ld -mcpu=cortex-m3 -DCORTEX_M3 -o $@ $^ -DTEST=$* -lgcc + +%.bin: %.elf + arm-none-eabi-objcopy -O binary $< $@ +.PRECIOUS: %.bin + +AES_OPTIONS = -DCF_AES_ENCRYPT_ONLY=1 -DCF_SIDE_CHANNEL_PROTECTION=0 +AES128_OPTIONS = -DCF_AES_MAXROUNDS=AES128_ROUNDS +AES256_OPTIONS = -DCF_AES_MAXROUNDS=AES256_ROUNDS + +AEADPERF_BRACKET = -DBRACKET_MODE=1 -DBRACKET_START=0 -DBRACKET_END=256 -DBRACKET_STEP=4 + +CFLAGS_aes128block_test = $(AES_OPTIONS) $(AES128_OPTIONS) +CFLAGS_aes128sched_test = $(AES_OPTIONS) $(AES128_OPTIONS) +CFLAGS_aes128gcm_test = $(AES_OPTIONS) $(AES128_OPTIONS) +CFLAGS_aes128eax_test = $(AES_OPTIONS) $(AES128_OPTIONS) +CFLAGS_aes128ccm_test = $(AES_OPTIONS) $(AES128_OPTIONS) +CFLAGS_poly1305_test = $(AES_OPTIONS) $(AES128_OPTIONS) + +CFLAGS_aeadperf_aes128gcm = $(AES_OPTIONS) $(AES128_OPTIONS) $(AEADPERF_BRACKET) +CFLAGS_aeadperf_aes128eax = $(AES_OPTIONS) $(AES128_OPTIONS) $(AEADPERF_BRACKET) +CFLAGS_aeadperf_aes128ccm = $(AES_OPTIONS) $(AES128_OPTIONS) $(AEADPERF_BRACKET) +CFLAGS_aeadperf_aes256gcm = $(AES_OPTIONS) $(AES256_OPTIONS) $(AEADPERF_BRACKET) +CFLAGS_aeadperf_aes256eax = $(AES_OPTIONS) $(AES256_OPTIONS) $(AEADPERF_BRACKET) +CFLAGS_aeadperf_aes256ccm = $(AES_OPTIONS) $(AES256_OPTIONS) $(AEADPERF_BRACKET) +CFLAGS_aeadperf_norx = $(AEADPERF_BRACKET) +CFLAGS_aeadperf_chacha20poly1305 = $(AEADPERF_BRACKET) + +CFLAGS_aes256block_test = $(AES_OPTIONS) $(AES256_OPTIONS) +CFLAGS_aes256sched_test = $(AES_OPTIONS) $(AES256_OPTIONS) + +CFLAGS_testaes = -DCF_SIDE_CHANNEL_PROTECTION=0 + +CFLAGS = -I./ext -I../ext -I.. 
-Os -ffunction-sections -g \ + -Wall -Werror -std=gnu99 -mthumb +LDFLAGS = -nostartfiles -nostdlib -Wl,-gc-sections +CURVESRCS = unacl/cortex_m0_mpy121666.s unacl/cortex_m0_reduce25519.s unacl/mul.s unacl/sqr.s +SRCS = boot.c memcpy.s memset.s semihost.c semihost.s \ + ../sha1.c ../sha256.c ../sha512.c ../sha3.c ../blockwise.c ../chash.c \ + ../curve25519.c ../poly1305.c \ + ../aes.c ../eax.c ../gcm.c ../cbcmac.c ../ccm.c \ + ../modes.c ../cmac.c ../gf128.c \ + ../hmac.c ../pbkdf2.c ../salsa20.c ../chacha20.c \ + ../norx.c ../chacha20poly1305.c ../drbg.c + +$(patsubst %,%.stm32f0.elf, $(FUNCS) $(AEADS)): $(SRCS) main.c $(CURVESRCS) +$(patsubst %,%.stm32f1.elf, $(FUNCS) $(AEADS)): $(SRCS) main.c $(CURVESRCS) +$(patsubst %,%.stm32f3.elf, $(FUNCS) $(AEADS)): $(SRCS) main.c $(CURVESRCS) +$(patsubst %,%.efm32.elf, $(FUNCS) $(AEADS)): $(SRCS) main.c $(CURVESRCS) +$(patsubst %,%.qemucm3.elf, $(FUNCS) $(AEADS)): $(SRCS) main.c $(CURVESRCS) + +$(patsubst %,testcurve25519.%.elf, $(ARCHS)): $(SRCS) $(CURVESRCS) ../testcurve25519.c +$(patsubst %,testaes.%.elf, $(ARCHS)): $(SRCS) ../testaes.c +$(patsubst %,testmodes.%.elf, $(ARCHS)): $(SRCS) ../testmodes.c +$(patsubst %,testsalsa20.%.elf, $(ARCHS)): $(SRCS) ../testsalsa20.c +$(patsubst %,testsha1.%.elf, $(ARCHS)): $(SRCS) ../testsha1.c +$(patsubst %,testsha2.%.elf, $(ARCHS)): $(SRCS) ../testsha2.c +$(patsubst %,testsha3.%.elf, $(ARCHS)): $(SRCS) ../testsha3.c +$(patsubst %,testpoly1305.%.elf, $(ARCHS)): $(SRCS) ../testpoly1305.c +$(patsubst %,testnorx.%.elf, $(ARCHS)): $(SRCS) ../testnorx.c +$(patsubst %,testchacha20poly1305.%.elf, $(ARCHS)): $(SRCS) ../testchacha20poly1305.c +$(patsubst %,testdrbg.%.elf, $(ARCHS)): $(SRCS) ../testdrbg.c + +run.%.qemucm3: %.qemucm3.bin + arm-none-eabi-readelf -l $(patsubst %.bin,%.elf,$^) > $@.log + qemu-system-gnuarmeclipse -verbose -verbose -M STM32-P103 -kernel $^ -semihosting -nographic -monitor null -serial null 2>> $@.log + cat $@.log + +run.%.efm32: %.efm32.elf + arm-none-eabi-readelf -l 
$^ > $@.log + echo '-----' >> $@.log + openocd -f openocd.efm32.cfg >> $@.log & + arm-none-eabi-gdb --quiet --batch-silent \ + $^ \ + -ex 'target remote :3333' \ + -ex 'monitor reset halt' \ + -ex 'load' \ + -ex 'monitor arm semihosting enable' \ + -ex 'monitor reset run' \ + -ex 'monitor wait_halt 720000' \ + -ex 'monitor shutdown' + +run.%.stm32f0: %.stm32f0.elf + arm-none-eabi-readelf -l $^ > $@.log + echo '-----' >> $@.log + openocd -f openocd.stm32f0.cfg >> $@.log & + arm-none-eabi-gdb --quiet --batch-silent \ + $^ \ + -ex 'target remote :3333' \ + -ex 'monitor reset halt' \ + -ex 'load' \ + -ex 'monitor arm semihosting enable' \ + -ex 'monitor reset run' \ + -ex 'monitor wait_halt 720000' \ + -ex 'monitor shutdown' + +run.%.stm32f1: %.stm32f1.elf + arm-none-eabi-readelf -l $^ > $@.log + echo '-----' >> $@.log + openocd -f openocd.stm32f1.cfg >> $@.log & + arm-none-eabi-gdb --quiet --batch-silent \ + $^ \ + -ex 'target remote :3333' \ + -ex 'monitor reset halt' \ + -ex 'load' \ + -ex 'monitor arm semihosting enable' \ + -ex 'monitor reset run' \ + -ex 'monitor wait_halt 720000' \ + -ex 'monitor shutdown' + +run.%.stm32f3: %.stm32f3.elf + arm-none-eabi-readelf -l $^ > $@.log + echo '-----' >> $@.log + openocd -f openocd.stm32f3.cfg >> $@.log & + arm-none-eabi-gdb --quiet --batch-silent \ + $^ \ + -ex 'target remote :3333' \ + -ex 'monitor reset halt' \ + -ex 'load' \ + -ex 'monitor arm semihosting enable' \ + -ex 'monitor reset run' \ + -ex 'monitor wait_halt 720000' \ + -ex 'monitor shutdown' + +test: $(patsubst %,run.%.qemucm3,$(FUNCS) $(TESTS)) +.PHONY: test + +perf.stm32f0: $(patsubst %,run.%.stm32f0,$(FUNCS)) +.PHONY: perf.stm32f0 + +test.stm32f0: $(patsubst %,run.%.stm32f0,$(FUNCS) $(TESTS)) +.PHONY: test.stm32f0 + +test.stm32f1: $(patsubst %,run.%.stm32f1,$(FUNCS) $(TESTS)) +.PHONY: test.stm32f1 + +test.stm32f3: $(patsubst %,run.%.stm32f3,$(FUNCS) $(TESTS)) +.PHONY: test.stm32f3 + +clean: + rm -rf *.log *.elf *.bin diff --git 
a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/analyse.py b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/analyse.py new file mode 100644 index 00000000..d2c456fe --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/analyse.py @@ -0,0 +1,207 @@ +import subprocess +import sys +import re + +function_intro_re = re.compile(r'^(?P<addr>[0-9a-fA-F]{8}) <(?P<name>[a-zA-Z0-9\._]+)>:$') +insn_re = re.compile(r'^\s+(?P<addr>[0-9a-fA-F]+):\s+(?P<insn>[0-9a-fA-F ]+)\s+\t(?P<op>.*)$') + +class Instruction: + def __init__(self, addr, insn, op): + self.addr = long(addr, 16) + self.insn = insn + + args = op.split('\t', 1) + + self.op = args[0].strip() + if len(args) == 2: + comment = args[1].strip().split(';', 1) + else: + comment = args + + self.args = comment[0].strip() + + if len(comment) == 2: + self.comment = comment[1].strip() + else: + self.comment = '' + + def __repr__(self): + return '<insn %r>' % (self.__dict__) + + +def literal_branch_target(t): + return ' <' in t + +class Function: + def __init__(self, addr, name): + self.name = name + self.addr = long(addr, 16) + self.insns = [] + self.calls = [] + + def __repr__(self): + return '<%s %d instructions>' % (self.name, len(self.insns)) + + def add_insn(self, insn): + self.insns.append(Instruction(**insn)) + + def contains_addr(self, addr): + if self.insns: + return addr >= self.addr and addr <= self.insns[-1].addr + else: + return addr == self.addr + + def dump(self): + print self.name + ':' + for insn in self.insns: + print ' ', '%04x' % insn.addr + ':', insn.op, insn.args, '\t;', insn.comment + + def get_literal_word(self, addr): + for insn in self.insns: + if insn.addr == addr and insn.op == '.word': + w = int(insn.args, 16) + if w & 0x80000000: + w = -(w ^ 0xffffffff) + 1 + return w + return None + + def analyse(self, prog): + self.stack_guess = None + regs = {} + + for insn in self.insns: + # stack adjustment with literal + if insn.op == 'sub' and insn.args.startswith('sp, ') and 
self.stack_guess is None: + sz = int(insn.args.split('#', 1)[1]) + self.stack_guess = sz + + # literal pool loads + if insn.op == 'ldr' and ', [pc, #' in insn.args: + reg, offset = insn.args.split(', [pc, #') + offset = int(offset.replace(']', '')) + word = self.get_literal_word(insn.addr + offset + 2) + if word is not None: + regs[reg] = word + + if insn.op == 'add' and insn.args.startswith('sp, r') and self.stack_guess is None: + reg = insn.args.split(', ')[1] + if reg in regs: + self.stack_guess = regs[reg] + + # static branches + if insn.op[0] == 'b' and literal_branch_target(insn.args): + target = long(insn.args.split(' <', 1)[0], 16) + + targetf = prog.function_at_addr(target) + + if targetf and targetf != self: + self.calls.append(targetf) + + if self.stack_guess is None: + self.stack_guess = 0 + + def stack_usage(self, hints, warns, prog, depth = 0): + hinted_calls = [] + if self.stack_guess: + print ' ' * depth, 'stack:', self.name, self.stack_guess, 'bytes' + + our_hints = [h for h in hints if h and h[0] == self.name] + if our_hints: + hints = [h[1:] for h in our_hints] + hinted_calls = [prog.function_by_name(h[0]) for h in hints if h] + else: + if self.name in warns: + print ' WARN: no calls hints for fn-ptr caller', self.name + + if self.calls + hinted_calls: + call_usage = max([f.stack_usage(hints, warns, prog, depth + 1) for f in self.calls + hinted_calls]) + else: + call_usage = 0 + return self.stack_guess + call_usage + +class Program: + def __init__(self): + self.functions = [] + + # sequence of tuples naming a call sequence known to occur + # this allows working out calls through pointers + self.call_hints = [] + + # function names to warn on if we don't have callees + self.call_warns = set() + + def read_elf(self, elf): + current_fn = None + + for x in subprocess.Popen(['arm-none-eabi-objdump', '-d', elf], + stdout = subprocess.PIPE).stdout: + x = x.rstrip('\n') + m = function_intro_re.match(x) + if m: + fn = Function(**m.groupdict()) + 
current_fn = fn + self.functions.append(fn) + + m = insn_re.match(x) + if m: + assert current_fn + current_fn.add_insn(m.groupdict()) + + def analyse(self): + for f in self.functions: + f.analyse(self) + + def function_by_name(self, name): + fns = [fn for fn in self.functions if fn.name == name] + if len(fns) == 0: + return None + elif len(fns) == 1: + return fns[0] + else: + print 'warn: more than one function named', name + return None + + def function_at_addr(self, addr): + for f in self.functions: + if f.addr == addr: + return f + return None + + def add_call_hint(self, *seq): + self.call_hints.append(seq) + + def add_call_warn(self, fn): + self.call_warns.add(fn) + + def measure_stack(self, name): + fn = self.function_by_name(name) + if fn is None: + return 0 + + return fn.stack_usage(self.call_hints, self.call_warns, self) + +_, exe, fn = sys.argv + +p = Program() +p.read_elf(exe) + +p.analyse() + +# calls which indirect through fn ptrs +p.add_call_warn('cf_blockwise_accumulate') +p.add_call_warn('cf_blockwise_accumulate_final') + +# hints to resolve those +p.add_call_hint('cf_sha224_update', 'cf_blockwise_accumulate', 'cf_blockwise_accumulate_final', 'sha256_update_block') +p.add_call_hint('cf_sha256_update', 'cf_blockwise_accumulate', 'cf_blockwise_accumulate_final', 'sha256_update_block') +p.add_call_hint('cf_sha384_update', 'cf_blockwise_accumulate', 'cf_blockwise_accumulate_final', 'sha512_update_block') +p.add_call_hint('cf_sha512_update', 'cf_blockwise_accumulate', 'cf_blockwise_accumulate_final', 'sha512_update_block') +p.add_call_hint('cf_norx32_encrypt', 'input', 'cf_blockwise_accumulate', 'cf_blockwise_accumulate_final', 'input_block') +p.add_call_hint('cf_norx32_decrypt', 'input', 'cf_blockwise_accumulate', 'cf_blockwise_accumulate_final', 'input_block') +p.add_call_hint('cf_cbcmac_stream_update', 'cf_blockwise_accumulate', 'cf_blockwise_accumulate_final', 'cbcmac_process') +p.add_call_hint('cf_cmac_stream_update', 'cf_blockwise_accumulate', 
'cf_blockwise_accumulate_final', 'cmac_process_final_pad') +p.add_call_hint('cf_cmac_stream_update', 'cf_blockwise_accumulate_final', 'cmac_process') +p.add_call_hint('cf_cmac_stream_update', 'cf_blockwise_accumulate_final', 'cmac_process_final_nopad') + + +print 'stack', fn, '=', p.measure_stack(fn) diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/boot.c b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/boot.c new file mode 100644 index 00000000..d2a8e407 --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/boot.c @@ -0,0 +1,144 @@ +#include <stdint.h> +#include <stdlib.h> +#include <string.h> + +extern int main(void); + +/* --- Defined by link script --- */ +extern uint32_t __etext; /* End of text/start of data. */ +extern uint32_t __data_start__, __data_end__; /* Data addresses in RAM */ +extern uint32_t __bss_start__, __bss_end__; /* BSS addresses in RAM */ +extern uint32_t __StackTop; /* End of stack in RAM */ + +#define ATTR_SECTION(sec) __attribute__ ((section (sec))) + +/* --- Interrupt vector table. 
--- */ +void Reset_Handler(void); +void SysTick_Handler(void); +void infinite_loop(void); +void do_nothing(void); + +typedef void (*vector_fn)(void); + +typedef struct { + uint32_t *stack_top; + vector_fn reset, nmi, hard_fault, mmu_fault, bus_fault, usage_fault; + vector_fn reserved0[4]; + vector_fn svc, debug_monitor; + vector_fn reserved1; + vector_fn pendsv, systick; + vector_fn irq[128]; +} vectors_t; + +#define COPY2(v) v, v +#define COPY4(v) COPY2(v), COPY2(v) +#define COPY8(v) COPY4(v), COPY4(v) +#define COPY16(v) COPY8(v), COPY8(v) +#define COPY32(v) COPY16(v), COPY16(v) +#define COPY64(v) COPY32(v), COPY32(v) +#define COPY128(v) COPY64(v), COPY64(v) + +vectors_t vectors ATTR_SECTION(".isr_vector") = { + .stack_top = &__StackTop, + .reset = Reset_Handler, + .nmi = do_nothing, + .hard_fault = infinite_loop, + .mmu_fault = infinite_loop, + .bus_fault = infinite_loop, + .usage_fault = infinite_loop, + .svc = do_nothing, + .debug_monitor = do_nothing, + .pendsv = do_nothing, + .systick = SysTick_Handler, + .irq = { COPY128(do_nothing) } +}; + +/* --- ISRs --- */ +void Reset_Handler(void) /* Reset vector: load .data, zero .bss, call main(), then spin forever if main() returns. */ +{ + /* Copy data segment contents from flash to RAM. The linker script loads .data at __etext (".data : AT (__etext)") and runs it at __data_start__, so RAM is the destination and must be memcpy's first argument. */ + uint32_t data_bytes = (&__data_end__ - &__data_start__) * 4; /* uint32_t pointer difference counts words; x4 gives bytes */ + memcpy(&__data_start__, &__etext, data_bytes); + + /* Zero BSS. 
*/ + uint32_t bss_bytes = (&__bss_end__ - &__bss_start__) * 4; + memset(&__bss_start__, 0, bss_bytes); + + main(); + while (1) + ; +} + +void __assert_func(const char *file, int line, const char *func, const char *expr) +{ + while (1) + ; +} + +void infinite_loop(void) +{ + while (1) + ; +} + +void do_nothing(void) +{ +} + +uint32_t ticks = 0; + +void SysTick_Handler(void) +{ + ticks++; +} + +uint32_t get_ticks(void) +{ + return ticks; +} + +void reset_ticks(void) +{ + ticks = 0; +} + +void *memmove(void *vtarg, const void *vsrc, size_t len) /* Copy len bytes from vsrc to vtarg, tolerating overlap; returns vtarg. */ +{ + if (vsrc > vtarg) /* source above target: delegate to memcpy (assumes the memcpy in memcpy.s copies in ascending address order -- TODO confirm) */ + return memcpy(vtarg, vsrc, len); + else if (vsrc == vtarg) + return vtarg; + + uint8_t *targ = vtarg; + const uint8_t *src = vsrc; + + for (size_t i = len; i != 0; i--) /* source below target: copy backwards, last byte first, so overlapping source bytes are read before they are overwritten */ + targ[i - 1] = src[i - 1]; + return vtarg; +} + +int memcmp(const void *va, const void *vb, size_t len) +{ + const uint8_t *a = va, *b = vb; + + for (size_t i = 0; i < len; i++) + { + if (a[i] != b[i]) + return a[i] < b[i] ? -1 : 1; + } + + return 0; +} + +size_t strlen(const char *c) +{ + size_t r = 0; + while (*c++) r++; + return r; +} + +void abort(void) +{ + while (1) + ; +} diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/curve25519-results.txt b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/curve25519-results.txt new file mode 100644 index 00000000..968e40e5 --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/curve25519-results.txt @@ -0,0 +1,22 @@ +STM32F0 +donna-before-opt: 12907000c +donna-after-opt: 17294000c +donna-reset-opt: 12947000c +~20k + +donna -O2 -Os: 15268000c +donna -O2 -Os noasm: 20453000c +donna -Os: 15748000c +7.4k + +donna -O3: 12907000c 16KB 3380b +donna -Os: 15748000c 7.4KB 3148b +donna -O2: 15218000c 7.9KB 3148b + +tweetnacl -O2: 68876000c 3.0KB 2268b +tweetnacl -Os: 75979000c 2.8KB 2244b +tweetnacl -O3: 69622000c 8.9KB 2900b + +naclref -Os: 47813000c 3.2KB 4012b +naclref -O3: 35059000c 4.1KB 4044b +naclref -O2: 34309000c 3.5KB 4036b diff --git 
a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/ext/cutest.h b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/ext/cutest.h new file mode 100644 index 00000000..fa3c5d84 --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/ext/cutest.h @@ -0,0 +1,55 @@ +/* cutest, for embedded targets. */ + +#ifndef CUTEST_H +#define CUTEST_H + +/* Main interface. */ +#define TEST_LIST const struct test__ test_list__[] +#define TEST_CHECK(cond) test_check__((cond), __FILE__, __LINE__, #cond) +/* no TEST_CHECK_ -- we don't have a good enough printf */ + +/* Implementation */ +#include "../semihost.h" + +struct test__ +{ + const char *name; + void (*func)(void); +}; + +extern const struct test__ test_list__[]; + +static void test_check__(int cond, const char *file, int line, const char *expr) +{ + if (cond) + return; /* pass */ + + emit("Failed!\n"); + emit("File: "); emit(file); emit("\n"); + emit("Line: "); emit_uint32(line); emit("\n"); + emit("Expr: "); emit(expr); emit("\n"); + quit_failure(); +} + +static void run_test__(const struct test__ *t) +{ + emit(" "); emit(t->name); emit(": "); + t->func(); + emit("OK\n"); +} + +int main(void) +{ + emit("Running tests:\n"); + + for (const struct test__ *t = test_list__; + t->name; + t++) + { + run_test__(t); + } + emit("Success\n"); + quit_success(); +} + +#endif diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.efm32.ld b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.efm32.ld new file mode 100644 index 00000000..8b9a6bfd --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.efm32.ld @@ -0,0 +1,8 @@ +MEMORY +{ + FLASH (rx) : ORIGIN = 0x00000000, LENGTH = 64K + RAM (rwx) : ORIGIN = 0x20000000, LENGTH = 8K +} + +INCLUDE linkscript.std.ld + diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.lm3s6965evb.ld b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.lm3s6965evb.ld new file mode 
100644 index 00000000..14fdac4e --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.lm3s6965evb.ld @@ -0,0 +1,7 @@ +MEMORY +{ + FLASH (rx) : ORIGIN = 0x00000000, LENGTH = 256K + RAM (rwx) : ORIGIN = 0x20000000, LENGTH = 64K +} + +INCLUDE linkscript.std.ld diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.qemucm3.ld b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.qemucm3.ld new file mode 100644 index 00000000..28264674 --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.qemucm3.ld @@ -0,0 +1,8 @@ +MEMORY +{ + FLASH (rx) : ORIGIN = 0x00000000, LENGTH = 128K + RAM (rwx) : ORIGIN = 0x20000000, LENGTH = 20K +} + +INCLUDE linkscript.std.ld + diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.std.ld b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.std.ld new file mode 100644 index 00000000..c08d7bea --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.std.ld @@ -0,0 +1,172 @@ + +/* Linker script to place sections and symbol values. Should be used together + * with other linker script that defines memory regions FLASH and RAM. 
+ * It references following symbols, which must be defined in code: + * Reset_Handler : Entry of reset handler + * + * It defines following symbols, which code can use without definition: + * __exidx_start + * __exidx_end + * __copy_table_start__ + * __copy_table_end__ + * __zero_table_start__ + * __zero_table_end__ + * __etext + * __data_start__ + * __preinit_array_start + * __preinit_array_end + * __init_array_start + * __init_array_end + * __fini_array_start + * __fini_array_end + * __data_end__ + * __bss_start__ + * __bss_end__ + * __end__ + * end + * __HeapLimit + * __StackLimit + * __StackTop + * __stack + */ +ENTRY(Reset_Handler) + +SECTIONS +{ + .text : + { + KEEP(*(.isr_vector)) + *(.text*) + + KEEP(*(.init)) + KEEP(*(.fini)) + + /* .ctors */ + *crtbegin.o(.ctors) + *crtbegin?.o(.ctors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors) + *(SORT(.ctors.*)) + *(.ctors) + + /* .dtors */ + *crtbegin.o(.dtors) + *crtbegin?.o(.dtors) + *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors) + *(SORT(.dtors.*)) + *(.dtors) + + *(.rodata*) + + KEEP(*(.eh_frame*)) + } > FLASH + + .ARM.extab : + { + *(.ARM.extab* .gnu.linkonce.armextab.*) + } > FLASH + + __exidx_start = .; + .ARM.exidx : + { + *(.ARM.exidx* .gnu.linkonce.armexidx.*) + } > FLASH + __exidx_end = .; + + /* To copy multiple ROM to RAM sections, + * uncomment .copy.table section and, + * define __STARTUP_COPY_MULTIPLE in startup_ARMCMx.S */ + + .copy.table : + { + . = ALIGN(4); + __copy_table_start__ = .; + LONG (__etext) + LONG (__data_start__) + LONG (__data_end__ - __data_start__) + __copy_table_end__ = .; + } > FLASH + + + /* To clear multiple BSS sections, + * uncomment .zero.table section and, + * define __STARTUP_CLEAR_BSS_MULTIPLE in startup_ARMCMx.S */ + .zero.table : + { + . = ALIGN(4); + __zero_table_start__ = .; + LONG (__bss_start__) + LONG (__bss_end__ - __bss_start__) + __zero_table_end__ = .; + } > FLASH + + __etext = .; + + .data : AT (__etext) + { + __data_start__ = .; + *(vtable) + *(.data*) + + . 
= ALIGN(4); + /* preinit data */ + PROVIDE_HIDDEN (__preinit_array_start = .); + KEEP(*(.preinit_array)) + PROVIDE_HIDDEN (__preinit_array_end = .); + + . = ALIGN(4); + /* init data */ + PROVIDE_HIDDEN (__init_array_start = .); + KEEP(*(SORT(.init_array.*))) + KEEP(*(.init_array)) + PROVIDE_HIDDEN (__init_array_end = .); + + + . = ALIGN(4); + /* finit data */ + PROVIDE_HIDDEN (__fini_array_start = .); + KEEP(*(SORT(.fini_array.*))) + KEEP(*(.fini_array)) + PROVIDE_HIDDEN (__fini_array_end = .); + + KEEP(*(.jcr*)) + . = ALIGN(4); + /* All data end */ + __data_end__ = .; + + } > RAM + + .bss : + { + . = ALIGN(4); + __bss_start__ = .; + *(.bss*) + *(COMMON) + . = ALIGN(4); + __bss_end__ = .; + } > RAM + + .heap (COPY): + { + __end__ = .; + PROVIDE(end = .); + *(.heap*) + __HeapLimit = .; + } > RAM + + /* .stack_dummy section doesn't contains any symbols. It is only + * used for linker to calculate size of stack sections, and assign + * values to stack symbols later */ + .stack_dummy (COPY): + { + *(.stack*) + } > RAM + + /* Set stack top to end of RAM, and stack limit move down by + * size of stack_dummy section */ + __StackTop = ORIGIN(RAM) + LENGTH(RAM); + __StackLimit = __StackTop - SIZEOF(.stack_dummy); + PROVIDE(__stack = __StackTop); + + /* Check if data + heap + stack exceeds RAM limit */ + ASSERT(__StackLimit >= __HeapLimit, "region RAM overflowed with stack") +} diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.stm32f0.ld b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.stm32f0.ld new file mode 100644 index 00000000..c7a3bd85 --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.stm32f0.ld @@ -0,0 +1,8 @@ +MEMORY +{ + FLASH (rx) : ORIGIN = 0x08000000, LENGTH = 128K + RAM (rwx) : ORIGIN = 0x20000000, LENGTH = 16K +} + +INCLUDE linkscript.std.ld + diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.stm32f1.ld 
b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.stm32f1.ld new file mode 100644 index 00000000..d13f58de --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.stm32f1.ld @@ -0,0 +1,8 @@ +MEMORY +{ + FLASH (rx) : ORIGIN = 0x08000000, LENGTH = 64K + RAM (rwx) : ORIGIN = 0x20000000, LENGTH = 8K +} + +INCLUDE linkscript.std.ld + diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.stm32f3.ld b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.stm32f3.ld new file mode 100644 index 00000000..92eee46e --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/linkscript.stm32f3.ld @@ -0,0 +1,8 @@ +MEMORY +{ + FLASH (rx) : ORIGIN = 0x08000000, LENGTH = 64K + RAM (rwx) : ORIGIN = 0x20000000, LENGTH = 12K +} + +INCLUDE linkscript.std.ld + diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/main.c b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/main.c new file mode 100644 index 00000000..5b7cbf22 --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/main.c @@ -0,0 +1,447 @@ +#ifndef TEST +# error You must select a function to test. 
+#endif + +#include "semihost.h" +#include "aes.h" +#include "hmac.h" +#include "sha2.h" +#include "sha3.h" +#include "modes.h" +#include "salsa20.h" +#include "curve25519.h" +#include "poly1305.h" +#include "norx.h" +#include "chacha20poly1305.h" + +#include <stdio.h> + +typedef void (*measure_fn)(void); +static uint32_t bracket; /* bracket mode parameter */ + +static void do_nothing(void) +{ +} + +static void stack_64w(void) +{ + volatile uint32_t words[64]; + words[0] = 0; + words[63] = 0; + (void) words[63]; +} + +static void stack_8w(void) +{ + volatile uint32_t words[8]; + words[0] = 0; + words[7] = 0; + (void) words[7]; +} + +static void hashtest_sha256(void) +{ + uint8_t hash[CF_SHA256_HASHSZ]; + cf_sha256_context ctx; + cf_sha256_init(&ctx); + cf_sha256_update(&ctx, "", 0); + cf_sha256_digest_final(&ctx, hash); +} + +static void hashtest_sha512(void) +{ + uint8_t hash[CF_SHA512_HASHSZ]; + cf_sha512_context ctx; + cf_sha512_init(&ctx); + cf_sha512_update(&ctx, "", 0); + cf_sha512_digest_final(&ctx, hash); +} + +static void hashtest_sha3_256(void) +{ + uint8_t hash[CF_SHA3_256_HASHSZ]; + cf_sha3_context ctx; + cf_sha3_256_init(&ctx); + cf_sha3_256_update(&ctx, "", 0); + cf_sha3_256_digest_final(&ctx, hash); +} + +static void hashtest_sha3_512(void) +{ + uint8_t hash[CF_SHA3_512_HASHSZ]; + cf_sha3_context ctx; + cf_sha3_512_init(&ctx); + cf_sha3_512_update(&ctx, "", 0); + cf_sha3_512_digest_final(&ctx, hash); +} + +static void aes128block_test(void) +{ + uint8_t key[16] = { 0 }, block[16] = { 0 }; + cf_aes_context ctx; + cf_aes_init(&ctx, key, sizeof key); + cf_aes_encrypt(&ctx, block, block); +} + +static void aes128sched_test(void) +{ + uint8_t key[16] = { 0 }; + cf_aes_context ctx; + cf_aes_init(&ctx, key, sizeof key); +} + +static void aes256block_test(void) +{ + uint8_t key[32] = { 0 }, block[16] = { 0 }; + cf_aes_context ctx; + cf_aes_init(&ctx, key, sizeof key); + cf_aes_encrypt(&ctx, block, block); +} + +static void aes256sched_test(void) +{ + uint8_t 
key[32] = { 0 }; + cf_aes_context ctx; + cf_aes_init(&ctx, key, sizeof key); +} + +static void aes128gcm_test(void) +{ + uint8_t key[16] = { 0 }; + cf_aes_context ctx; + cf_aes_init(&ctx, key, sizeof key); + + uint8_t msg[16] = { 0 }; + uint8_t aad[16] = { 0 }; + uint8_t nonce[12] = { 0 }; + uint8_t cipher[16] = { 0 }; + uint8_t tag[16] = { 0 }; + + cf_gcm_encrypt(&cf_aes, &ctx, + msg, sizeof msg, + aad, sizeof aad, + nonce, sizeof nonce, + cipher, + tag, sizeof tag); +} + +static void aes128eax_test(void) +{ + uint8_t key[16] = { 0 }; + cf_aes_context ctx; + cf_aes_init(&ctx, key, sizeof key); + + uint8_t msg[16] = { 0 }; + uint8_t aad[16] = { 0 }; + uint8_t nonce[12] = { 0 }; + uint8_t cipher[16] = { 0 }; + uint8_t tag[16] = { 0 }; + + cf_eax_encrypt(&cf_aes, &ctx, + msg, sizeof msg, + aad, sizeof aad, + nonce, sizeof nonce, + cipher, + tag, sizeof tag); +} + +static void aes128ccm_test(void) +{ + uint8_t key[16] = { 0 }; + cf_aes_context ctx; + cf_aes_init(&ctx, key, sizeof key); + + uint8_t msg[16] = { 0 }; + uint8_t aad[16] = { 0 }; + uint8_t nonce[11] = { 0 }; + uint8_t cipher[16] = { 0 }; + uint8_t tag[16] = { 0 }; + + cf_ccm_encrypt(&cf_aes, &ctx, + msg, sizeof msg, 4, + aad, sizeof aad, + nonce, sizeof nonce, + cipher, + tag, sizeof tag); +} + +static void salsa20_test(void) +{ + uint8_t key[32] = { 0 }; + uint8_t nonce[8] = { 0 }; + uint8_t msg[64] = { 0 }; + uint8_t cipher[64] = { 0 }; + + cf_salsa20_ctx ctx; + cf_salsa20_init(&ctx, key, sizeof key, nonce); + cf_salsa20_cipher(&ctx, msg, cipher, sizeof msg); +} + +static void chacha20_test(void) +{ + uint8_t key[32] = { 0 }; + uint8_t nonce[8] = { 0 }; + uint8_t msg[64] = { 0 }; + uint8_t cipher[64] = { 0 }; + + cf_chacha20_ctx ctx; + cf_chacha20_init(&ctx, key, sizeof key, nonce); + cf_chacha20_cipher(&ctx, msg, cipher, sizeof msg); +} + +static void curve25519_test(void) +{ + uint8_t secret[32] = { 1 }; + uint8_t pubkey[32]; + cf_curve25519_mul_base(pubkey, secret); +} + +static const uint8_t 
*mac_message = (const uint8_t *) "hello world"; +static const size_t mac_message_len = 11; + +static void poly1305_test(void) +{ + uint8_t key[32] = { 0 }, + nonce[16] = { 0 }, + encnonce[16], + mac[16]; + + cf_aes_context aes; + cf_aes_init(&aes, key, 16); + cf_aes_encrypt(&aes, nonce, encnonce); + + cf_poly1305 poly; + cf_poly1305_init(&poly, key + 16, encnonce); + cf_poly1305_update(&poly, mac_message, mac_message_len); + cf_poly1305_finish(&poly, mac); +} + +static void hmacsha256_test(void) +{ + uint8_t key[32] = { 0 }, + mac[32] = { 0 }; + + cf_hmac_ctx ctx; + cf_hmac_init(&ctx, &cf_sha256, key, sizeof key); + cf_hmac_update(&ctx, mac_message, mac_message_len); + cf_hmac_finish(&ctx, mac); +} + +static void norx_test(void) +{ + uint8_t key[16] = { 0 }; + uint8_t msg[16] = { 0 }; + uint8_t aad[16] = { 0 }; + uint8_t nonce[8] = { 0 }; + uint8_t cipher[16] = { 0 }; + uint8_t tag[16] = { 0 }; + + cf_norx32_encrypt(key, + nonce, + aad, sizeof aad, + msg, sizeof msg, + NULL, 0, + cipher, + tag); +} + +#ifndef BRACKET_MODE +# define AEADPERF_LEN 1 +#else +# define AEADPERF_LEN BRACKET_END +#endif + +static uint8_t aead_msg[AEADPERF_LEN] = { 0 }; +static uint8_t aead_cipher[AEADPERF_LEN] = { 0 }; +static uint8_t aead_aad[16] = { 0 }; +static uint8_t aead_key[32] = { 0 }; +static uint8_t aead_nonce[16] = { 0 }; +static uint8_t aead_tag[16] = { 0 }; + +static void aeadperf_norx(void) +{ + cf_norx32_encrypt(aead_key, aead_nonce, + aead_aad, sizeof aead_aad, + aead_msg, bracket, + NULL, 0, + aead_cipher, aead_tag); +} + +static void aeadperf_chacha20poly1305(void) +{ + cf_chacha20poly1305_encrypt(aead_key, aead_nonce, + aead_aad, sizeof aead_aad, + aead_msg, bracket, + aead_cipher, aead_tag); +} +static void aeadperf_aes128gcm(void) +{ + cf_aes_context ctx; + cf_aes_init(&ctx, aead_key, 16); + + cf_gcm_encrypt(&cf_aes, &ctx, + aead_msg, bracket, + aead_aad, sizeof aead_aad, + aead_nonce, 12, + aead_cipher, + aead_tag, 16); +} + +static void aeadperf_aes128ccm(void) +{ + 
cf_aes_context ctx; + cf_aes_init(&ctx, aead_key, 16); + + cf_ccm_encrypt(&cf_aes, &ctx, + aead_msg, bracket, + 4, + aead_aad, sizeof aead_aad, + aead_nonce, 11, + aead_cipher, + aead_tag, 16); +} + +static void aeadperf_aes128eax(void) +{ + cf_aes_context ctx; + cf_aes_init(&ctx, aead_key, 16); + + cf_eax_encrypt(&cf_aes, &ctx, + aead_msg, bracket, + aead_aad, sizeof aead_aad, + aead_nonce, 12, + aead_cipher, + aead_tag, 16); +} + +static void aeadperf_aes256gcm(void) +{ + cf_aes_context ctx; + cf_aes_init(&ctx, aead_key, 32); + + cf_gcm_encrypt(&cf_aes, &ctx, + aead_msg, bracket, + aead_aad, sizeof aead_aad, + aead_nonce, 12, + aead_cipher, + aead_tag, 16); +} + +static void aeadperf_aes256ccm(void) +{ + cf_aes_context ctx; + cf_aes_init(&ctx, aead_key, 32); + + cf_ccm_encrypt(&cf_aes, &ctx, + aead_msg, bracket, + 4, + aead_aad, sizeof aead_aad, + aead_nonce, 11, + aead_cipher, + aead_tag, 16); +} + +static void aeadperf_aes256eax(void) +{ + cf_aes_context ctx; + cf_aes_init(&ctx, aead_key, 32); + + cf_eax_encrypt(&cf_aes, &ctx, + aead_msg, bracket, + aead_aad, sizeof aead_aad, + aead_nonce, 12, + aead_cipher, + aead_tag, 16); +} + +/* Provided by linkscript */ +extern uint32_t __HeapLimit; + +#define STACK_MAGIC 0x57ac34df + +static __attribute__((noinline)) void clear_stack(void) +{ + uint32_t *stack_start = &__HeapLimit; + uint32_t ss = 0, *stack_stop = &ss; + size_t words = stack_stop - stack_start; + for (size_t i = 0; i < words; i++) + stack_start[i] = STACK_MAGIC; +} + +static __attribute__((noinline)) uint32_t measure_stack(void) +{ + uint32_t *stack_start = &__HeapLimit; + uint32_t ss, *stack_stop = &ss; + size_t words = stack_stop - stack_start; + for (size_t i = 0; i < words; i++) + if (stack_start[i] != STACK_MAGIC) + return words - i + 4; /* we used 4 words for ourselves, roughly */ + + return 0; +} + +static void measure(measure_fn fn) +{ + clear_stack(); + uint32_t start_cycles = reset_cycles(); + fn(); + uint32_t end_cycles = get_cycles(); + 
uint32_t stack_words = measure_stack(); + + emit("cycles = "); + emit_uint32(end_cycles - start_cycles); + emit("\n"); + emit("stack = "); + emit_uint32(stack_words << 2); + emit("\n"); +} + +#define STRING_(x) #x +#define STRING(x) STRING_(x) + +int main(void) +{ + emit(STRING(TEST) "\n"); +#ifdef BRACKET_MODE + for (bracket = BRACKET_START; bracket <= BRACKET_END; bracket += BRACKET_STEP) + { + emit("bracket = "); + emit_uint32(bracket); + emit("\n"); + measure(TEST); + } +#else + measure(TEST); +#endif + quit_success(); + + (void) bracket; + (void) do_nothing; + (void) stack_8w; + (void) stack_64w; + (void) hashtest_sha256; + (void) hashtest_sha512; + (void) hashtest_sha3_256; + (void) hashtest_sha3_512; + (void) aes128block_test; + (void) aes128sched_test; + (void) aes256block_test; + (void) aes256sched_test; + (void) aes128gcm_test; + (void) aes128eax_test; + (void) aes128ccm_test; + (void) salsa20_test; + (void) chacha20_test; + (void) curve25519_test; + (void) poly1305_test; + (void) hmacsha256_test; + (void) norx_test; + (void) aeadperf_norx; + (void) aeadperf_chacha20poly1305; + (void) aeadperf_aes128gcm; + (void) aeadperf_aes128ccm; + (void) aeadperf_aes128eax; + (void) aeadperf_aes256gcm; + (void) aeadperf_aes256ccm; + (void) aeadperf_aes256eax; +} diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/memcpy.s b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/memcpy.s new file mode 100644 index 00000000..63406fe5 --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/memcpy.s @@ -0,0 +1,49 @@ + .text + .syntax unified + .global memcpy + .func memcpy + .thumb_func + +memcpy: + /* on entry + * r0 = targ + * r1 = src + * r2 = len (bytes) + * on exit + * r0 = targ (unchanged) + */ + push {r0, r4, lr} + + /* If targ or src are unaligned, drop to byte + * processing. 
*/ + mov r3, r0 + movs r4, #3 + orrs r3, r1 + ands r3, r4 + bne L_bytewise + + /* Process words */ +L_wordwise: + cmp r2, #4 + blo L_bytewise + ldr r4, [r1] + adds r1, #4 + str r4, [r0] + adds r0, #4 + subs r2, #4 + b L_wordwise + + /* Process bytes */ +L_bytewise: + cmp r2, #0 + beq L_fin + ldrb r4, [r1] + adds r1, #1 + strb r4, [r0] + adds r0, #1 + subs r2, #1 + b L_bytewise + +L_fin: + pop {r0, r4, pc} + .endfunc diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/memset.s b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/memset.s new file mode 100644 index 00000000..a5019667 --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/memset.s @@ -0,0 +1,50 @@ + .text + .syntax unified + .global memset + .func memset + .thumb_func + +memset: + /* on entry + * r0 = targ + * r1 = value + * r2 = len (bytes) + * on exit + * r0 = targ (unchanged) + */ + push {r0, r4, lr} + + /* If targ is unaligned, drop to byte + * processing. */ + movs r3, #3 + ands r3, r0 + bne L_bytewise + + /* Process words */ + /* Build r4 by repeating r1. 
*/ + uxtb r4, r1 + lsls r3, r4, #8 + orrs r4, r3 + lsls r3, r4, #16 + orrs r4, r3 + +L_wordwise: + cmp r2, #4 + blo L_bytewise + str r4, [r0] + adds r0, #4 + subs r2, #4 + b L_wordwise + + /* Process bytes */ +L_bytewise: + cmp r2, #0 + beq L_fin + strb r1, [r0] + adds r0, #1 + subs r2, #1 + b L_bytewise + +L_fin: + pop {r0, r4, pc} + .endfunc diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/merge.py b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/merge.py new file mode 100644 index 00000000..71d50895 --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/merge.py @@ -0,0 +1,26 @@ +import sys + +def extract_results(results): + index = 0 + while index < len(results): + if results[index].startswith('## '): + end = results.index('\n', index) + yield results[index:end] + index += 1 + +def merge(readme, res): + title, table = res[0], res[1:] + assert title in readme, 'Section ' + title + ' missing from README.md' + secindex = readme.index(title) + hdrindex = [i for i in range(secindex, len(readme)) if readme[i].startswith('---------- | ')][0] + start = hdrindex - 1 + end = readme.index('\n', start) + table = [t.rstrip() + '\n' for t in table] + return readme[:start] + table + readme[end:] + +results = sys.stdin.readlines() +readme = open('../../README.md').readlines() + +for res in extract_results(results): + readme = merge(readme, res) +print ''.join(readme).rstrip() diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.efm32.cfg b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.efm32.cfg new file mode 100644 index 00000000..85af4733 --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.efm32.cfg @@ -0,0 +1,3 @@ +source [find interface/jlink.cfg] +transport select swd +source [find target/efm32.cfg] diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.stm32f0.cfg b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.stm32f0.cfg new 
file mode 100644 index 00000000..e9356f75 --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.stm32f0.cfg @@ -0,0 +1,3 @@ +source [find interface/stlink-v2.cfg] +transport select hla_swd +source [find target/stm32f0x.cfg] diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.stm32f1.cfg b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.stm32f1.cfg new file mode 100644 index 00000000..1108ea07 --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.stm32f1.cfg @@ -0,0 +1,3 @@ +source [find interface/stlink-v2.cfg] +transport select hla_swd +source [find target/stm32f1x.cfg] diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.stm32f3.cfg b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.stm32f3.cfg new file mode 100644 index 00000000..de023b84 --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/openocd.stm32f3.cfg @@ -0,0 +1,3 @@ +source [find interface/stlink-v2.cfg] +transport select hla_swd +source [find target/stm32f3x.cfg] diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/report.py b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/report.py new file mode 100644 index 00000000..718ab24e --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/report.py @@ -0,0 +1,276 @@ +""" +Interprets logs from test runs. Outputs ASCII +tables containing results, json data, etc. 
+""" + +import json +import sys + +archs = 'stm32f0 stm32f1 stm32f3'.split() +tests = """ +aes128block_test +aes256block_test +aes128sched_test +aes256sched_test +hashtest_sha256 +hashtest_sha512 +hashtest_sha3_256 +hashtest_sha3_512 +aes128gcm_test +aes128eax_test +aes128ccm_test +norx_test +salsa20_test +chacha20_test +poly1305_test +hmacsha256_test +curve25519_test +aeadperf_norx +aeadperf_aes128gcm +aeadperf_aes128eax +aeadperf_aes128ccm +aeadperf_aes256gcm +aeadperf_aes256eax +aeadperf_aes256ccm +aeadperf_chacha20poly1305 +do_nothing +""".split() + +arch_names = dict( + stm32f0 = 'Cortex-M0', + stm32f1 = 'Cortex-M3', + stm32f3 = 'Cortex-M4F' + ) + +base_test = 'do_nothing' + +def extract(arch, test): + fn = 'run.%s.%s.log' % (test, arch) + + code_size = 0 + data_size = 0 + cycle_count = None + stack_usage = None + brackets = None + current_bracket = None + + try: + lines = open(fn).readlines() + except IOError: + return None + + for l in lines: + if 'LOAD' in l: + parts = l.split() + assert len(parts) >= 8 + assert 'LOAD' == parts[0] + if parts[6] == 'RWE': + code_size += long(parts[5], 16) + if parts[6] == 'RW': + data_size += long(parts[5], 16) + + if l.startswith('bracket = '): + bracket = long(l.split(' = ')[1].strip(), 16) + current_bracket = bracket + if brackets is None: + brackets = {} + brackets[current_bracket] = dict() + + if l.startswith('cycles = '): + cycle_count = long(l.split(' = ')[1].strip(), 16) + if current_bracket is not None: + brackets[current_bracket]['cycle_count'] = cycle_count + + if l.startswith('stack = '): + stack_usage = long(l.split(' = ')[1].strip(), 16) + if current_bracket is not None: + brackets[current_bracket]['stack_usage'] = stack_usage + + return dict( + code_size = code_size, + data_size = data_size, + cycle_count = cycle_count, + stack_usage = stack_usage, + brackets = brackets + ) + +def print_table(rows): + header, rows = rows[0], rows[1:] + assert not [True for r in rows if len(r) != len(header)] + widths = [] + 
for i, h in enumerate(header): + widths.append(max([len(h)] + [len(r[i]) for r in rows])) + + def print_row(row): + print ' | '.join(c + (' ' * (widths[i] - len(c))) for i, c in enumerate(row)) + + print_row(header) + print_row(['-' * w for w in widths]) + for r in rows: + print_row(r) + +results = {} + +for arch in archs: + for test in tests: + inf = extract(arch, test) + if inf: + results.setdefault(arch, {})[test] = inf + +for arch in results.keys(): + if base_test not in results[arch]: + print 'need', base_test, 'results to report for', arch + continue + + base_result = results[arch][base_test] + + for test in results[arch].keys(): + if test == base_test: + continue + + results[arch][test]['code_size'] -= base_result['code_size'] + +def tabulate_aes(arch, block_result, sched_result, table = None): + if table is None: + table = [] + table.append(( + 'Core', + 'Cycles (key schedule + block)', + 'Cycles (key schedule)', + 'Cycles (block)', + 'Stack', + 'Code size' + )) + + table.append( + ( + arch_names[arch], + '%d' % block_result['cycle_count'], + '%d' % sched_result['cycle_count'], + '%d' % (block_result['cycle_count'] - sched_result['cycle_count']), + '%dB' % block_result['stack_usage'], + '%dB' % block_result['code_size'] + )) + + return table + +def print_std(result): + print """* **Cycles**: %(cycle_count)d +* **Stack**: %(stack_usage)dB +* **Code size**: %(code_size)dB +""" % result + +def tabulate_std(arch, result, table = None): + if table is None: + table = [] + table.append(('Core', 'Cycles', 'Stack', 'Code size')) + + table.append( + ( + arch_names[arch], + '%d' % result['cycle_count'], + '%dB' % result['stack_usage'], + '%dB' % result['code_size'] + )) + + return table + +def tabulate(mktab): + table = None + for arch in archs: + if arch not in results: + continue + table = mktab(arch, table) + print_table(table) + +def convert_brackets(metric, tests): + for arch in archs: + arch_result = {} + + # collect results for each test + for t in tests: + if 
arch not in results or t not in results[arch]: + print 'missing', arch, t + continue + data = results[arch][t]['brackets'] + arch_result[t] = [[b, data[b][metric]] for b in sorted(data.keys())] + + # convert into list of [bracket, test-1, test-2, ...] lists + out = [] + if len(arch_result) == 0: + continue + first_row = arch_result.values()[0] + + for i in range(len(first_row)): + row = [ first_row[i][0] ] + + for k in sorted(arch_result.keys()): + if len(arch_result[k]) != len(first_row): + print 'warn:', 'test', k, 'did not complete?' + rr = arch_result[k][i] + row.append(rr[1]) + + out.append(row) + + print json.dumps(out) + +convert_brackets('cycle_count', + [ + 'aeadperf_norx', + 'aeadperf_aes128gcm', + 'aeadperf_aes128eax', + 'aeadperf_aes128ccm', + 'aeadperf_aes256gcm', + 'aeadperf_aes256eax', + 'aeadperf_aes256ccm', + 'aeadperf_chacha20poly1305' + ]) +convert_brackets('stack_usage', + [ + 'aeadperf_norx', + 'aeadperf_aes128gcm', + 'aeadperf_aes128eax', + 'aeadperf_aes128ccm', + 'aeadperf_aes256gcm', + 'aeadperf_aes256eax', + 'aeadperf_aes256ccm', + 'aeadperf_chacha20poly1305' + ]) + +# screwed if we need other block ciphers +print '###', '128-bit key' +tabulate(lambda arch, table: tabulate_aes(arch, results[arch]['aes128block_test'], results[arch]['aes128sched_test'], table)) +print + +print '###', '256-bit key' +tabulate(lambda arch, table: tabulate_aes(arch, results[arch]['aes256block_test'], results[arch]['aes256sched_test'], table)) +print + +def do_table(title, test): + print '##', title + tabulate(lambda arch, table: tabulate_std(arch, results[arch][test], table)) + print + +do_table('AES128-GCM', 'aes128gcm_test') +do_table('AES128-EAX', 'aes128eax_test') +do_table('AES128-CCM', 'aes128ccm_test') +do_table('NORX32', 'norx_test') +do_table('ChaCha20', 'chacha20_test') +do_table('Salsa20', 'salsa20_test') +do_table('SHA256', 'hashtest_sha256') +do_table('SHA512', 'hashtest_sha512') +do_table('SHA3-256', 'hashtest_sha3_256') +do_table('SHA3-512', 
'hashtest_sha3_512') +do_table('HMAC-SHA256', 'hmacsha256_test') +do_table('Poly1305-AES', 'poly1305_test') +do_table('Curve25519', 'curve25519_test') + +if '--aead' in sys.argv: + do_table('AEAD-Shootout: NORX', 'aeadperf_norx') + do_table('AEAD-Shootout: AES-128-GCM', 'aeadperf_aes128gcm') + do_table('AEAD-Shootout: AES-128-EAX', 'aeadperf_aes128eax') + do_table('AEAD-Shootout: AES-128-CCM', 'aeadperf_aes128ccm') + do_table('AEAD-Shootout: AES-256-GCM', 'aeadperf_aes256gcm') + do_table('AEAD-Shootout: AES-256-EAX', 'aeadperf_aes256eax') + do_table('AEAD-Shootout: AES-256-CCM', 'aeadperf_aes256ccm') + do_table('AEAD-Shootout: ChaCha20-Poly1305', 'aeadperf_chacha20poly1305') diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/semihost.c b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/semihost.c new file mode 100644 index 00000000..cbe5aa2e --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/semihost.c @@ -0,0 +1,170 @@ +#include <stdint.h> +#include <stdlib.h> +#include <stdarg.h> +#include <string.h> + +#include "semihost.h" + +#define OP_WRITE0 0x04 +#define OP_EXIT 0x18 +#define OP_EXIT_ARG_FAILURE 0x0 +#define OP_EXIT_ARG_SUCCESS 0x20026 + +extern uint32_t semihost(uint32_t, volatile void *); + +__attribute__((noreturn)) +void quit_success(void) +{ + semihost(OP_EXIT, (void *) OP_EXIT_ARG_SUCCESS); + while (1) + ; +} + +__attribute__((noreturn)) +void quit_failure(void) +{ + semihost(OP_EXIT, (void *) OP_EXIT_ARG_FAILURE); + while (1) + ; +} + +void emit(const char *buf) +{ + semihost(OP_WRITE0, (volatile void *) buf); +} + +static void emit_extent(const char *start, const char *end) +{ + char buf[32+1]; + size_t bufmax = sizeof(buf) - 1; + buf[32] = 0; + + size_t bytes = end - start + 1; + + while (bytes >= bufmax) + { + memcpy(buf, start, bufmax); + emit(buf); + bytes -= bufmax; + start += bufmax; + } + + if (bytes == 0) + return; + + memcpy(buf, start, bytes); + buf[bytes] = 0; + emit(buf); +} + +void emitf(const 
char *fmt, ...) +{ + const char *start = fmt, *end = fmt; + + va_list args; + va_start(args, fmt); + + while (*fmt) + { + switch (*fmt) + { + case '%': + emit_extent(start, end); + + switch (fmt[1]) + { + case '%': + emit("%"); + break; + + case 'u': + emit_uint32(va_arg(args, uint32_t)); + break; + + case 's': + emit(va_arg(args, const char *)); + break; + } + start = end = fmt + 2; + break; + + default: + end = fmt; + break; + } + + fmt++; + } + + va_end(args); + emit_extent(start, end); +} + +static const char *hex_chars = "0123456789abcdef"; + +void emit_hex(const void *ptr, size_t len) +{ + const uint8_t *bb = ptr; + char byte[3]; + + byte[2] = 0; + + for (size_t i = 0; i < len; i++) + { + byte[0] = hex_chars[(bb[i] >> 4) & 0xf]; + byte[1] = hex_chars[bb[i] & 0xf]; + emit(byte); + } +} + +void emit_uint32(uint32_t x) +{ + char buf[sizeof "0x11223344"]; + buf[0] = '0'; + buf[1] = 'x'; + buf[2] = hex_chars[(x >> 28) & 0xf]; + buf[3] = hex_chars[(x >> 24) & 0xf]; + buf[4] = hex_chars[(x >> 20) & 0xf]; + buf[5] = hex_chars[(x >> 16) & 0xf]; + buf[6] = hex_chars[(x >> 12) & 0xf]; + buf[7] = hex_chars[(x >> 8) & 0xf]; + buf[8] = hex_chars[(x >> 4) & 0xf]; + buf[9] = hex_chars[x & 0xf]; + buf[10] = 0; + + emit(buf); +} + +typedef struct +{ + volatile uint32_t ctrl; + volatile uint32_t reload; + volatile uint32_t current; +} systick; + +#define SysTick ((systick *)0xe000e010) + +#define STCTRL_SYSCLOCK 0x04 +#define STCTRL_TICKINT 0x02 +#define STCTRL_ENABLE 0x01 + +#define STCTRL_MAX 0xffffff +#define STCTRL_SHIFT 24 + +extern uint32_t get_ticks(void); +extern void reset_ticks(void); + +uint32_t reset_cycles(void) +{ + SysTick->reload = STCTRL_MAX; + SysTick->ctrl = STCTRL_SYSCLOCK | STCTRL_TICKINT | STCTRL_ENABLE; + SysTick->current = 0; + reset_ticks(); + return get_ticks(); +} + +uint32_t get_cycles(void) +{ + return (get_ticks() << STCTRL_SHIFT) + (STCTRL_MAX - SysTick->current); +} + diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/semihost.h 
b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/semihost.h new file mode 100644 index 00000000..cf6f01a5 --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/semihost.h @@ -0,0 +1,40 @@ +#ifndef SEMIHOST_H +#define SEMIHOST_H + +#include <stdint.h> +#include <stdlib.h> + +/* Exits emulator with success (or merely hangs). */ +__attribute__((noreturn)) +void quit_success(void); + +/* Exits emulator with failure (or merely hangs). */ +__attribute__((noreturn)) +void quit_failure(void); + +/* Writes zero terminated string to debug output */ +void emit(const char *buf); + +/* Writes a formatting string to debug output. + * + * Supported: + * %u - uint32_t argument, same as emit_uint32 + * %s - const char * argument, same as emit + */ +void emitf(const char *fmt, ...); + +/* Writes hex dump of len bytes at ptr to debug output. */ +void emit_hex(const void *ptr, size_t len); + +/* Writes value v in hex to debug output, in format: + * 0xHHHHHHHH (equivalent to printf 0x%08x). */ +void emit_uint32(uint32_t v); + +/* Reset cycle counter to 0. Returns the current value + * (just after resetting it). */ +uint32_t reset_cycles(void); + +/* Return the value of the cycle counter. 
*/ +uint32_t get_cycles(void); + +#endif diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/semihost.s b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/semihost.s new file mode 100644 index 00000000..0fddf045 --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/semihost.s @@ -0,0 +1,15 @@ + .text + .syntax unified + .global semihost + .func semihost + .thumb_func + +semihost: + /* on entry + * r0 = op + * r1 = arg */ + push {r7, lr} + bkpt 0xab + pop {r7, pc} + + .endfunc diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/cortex_m0_mpy121666.s b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/cortex_m0_mpy121666.s new file mode 100644 index 00000000..49e3b5d0 --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/cortex_m0_mpy121666.s @@ -0,0 +1,199 @@ +// Implementation of multiplication of an fe25519 bit value with the curve constant 121666. +// +// B. Haase, Endress + Hauser Conducta GmbH & Ko. KG +// public domain. +// +// gnu assembler format. +// +// Generated and tested with C++ functions in the test subdirectory. +// +// ATTENTION: +// Not yet tested on target hardware. 
+ + + .cpu cortex-m0 + .fpu softvfp + .eabi_attribute 20, 1 + .eabi_attribute 21, 1 + .eabi_attribute 23, 3 + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + .eabi_attribute 26, 1 + .eabi_attribute 30, 2 + .eabi_attribute 34, 0 + .eabi_attribute 18, 4 + .code 16 + + .file "cortex_m0_reduce25519.s" + + .text + .align 2 + + .global fe25519_mpyWith121666_asm + .code 16 + .thumb_func + .type fe25519_mpyWith121666_asm, %function + +fe25519_mpyWith121666_asm: + push {r4,r5,r6,r7,r14} + ldr r7,__label_for_immediate_56130 + ldr r2,[r1,#28] + lsl r5,r2,#16 + lsr r6,r2,#16 + lsr r3,r2,#16 + uxth r2,r2 + mul r2,r7 + mul r3,r7 + add r5,r2 + mov r2,#0 + adc r6,r2 + lsl r2,r3,#16 + lsr r3,r3,#16 + add r5,r2 + adc r6,r3 + lsl r2,r5,#1 + lsr r2,r2,#1 + str r2,[r0,#28] + lsr r5,r5,#31 + lsl r6,r6,#1 + orr r5,r6 + mov r6,#19 + mul r5,r6 + mov r6,#0 + ldr r2,[r1,#0] + lsl r3,r2,#16 + lsr r4,r2,#16 + add r5,r3 + adc r6,r4 + lsr r3,r2,#16 + uxth r2,r2 + mul r2,r7 + mul r3,r7 + add r5,r2 + mov r2,#0 + adc r6,r2 + lsl r2,r3,#16 + lsr r3,r3,#16 + add r5,r2 + adc r6,r3 + str r5,[r0,#0] + mov r5,#0 + ldr r2,[r1,#4] + lsl r3,r2,#16 + lsr r4,r2,#16 + add r6,r3 + adc r5,r4 + lsr r3,r2,#16 + uxth r2,r2 + mul r2,r7 + mul r3,r7 + add r6,r2 + mov r2,#0 + adc r5,r2 + lsl r2,r3,#16 + lsr r3,r3,#16 + add r6,r2 + adc r5,r3 + str r6,[r0,#4] + mov r6,#0 + ldr r2,[r1,#8] + lsl r3,r2,#16 + lsr r4,r2,#16 + add r5,r3 + adc r6,r4 + lsr r3,r2,#16 + uxth r2,r2 + mul r2,r7 + mul r3,r7 + add r5,r2 + mov r2,#0 + adc r6,r2 + lsl r2,r3,#16 + lsr r3,r3,#16 + add r5,r2 + adc r6,r3 + str r5,[r0,#8] + mov r5,#0 + ldr r2,[r1,#12] + lsl r3,r2,#16 + lsr r4,r2,#16 + add r6,r3 + adc r5,r4 + lsr r3,r2,#16 + uxth r2,r2 + mul r2,r7 + mul r3,r7 + add r6,r2 + mov r2,#0 + adc r5,r2 + lsl r2,r3,#16 + lsr r3,r3,#16 + add r6,r2 + adc r5,r3 + str r6,[r0,#12] + mov r6,#0 + ldr r2,[r1,#16] + lsl r3,r2,#16 + lsr r4,r2,#16 + add r5,r3 + adc r6,r4 + lsr r3,r2,#16 + uxth r2,r2 + mul r2,r7 + mul r3,r7 + add r5,r2 + mov r2,#0 + adc r6,r2 + 
lsl r2,r3,#16 + lsr r3,r3,#16 + add r5,r2 + adc r6,r3 + str r5,[r0,#16] + mov r5,#0 + ldr r2,[r1,#20] + lsl r3,r2,#16 + lsr r4,r2,#16 + add r6,r3 + adc r5,r4 + lsr r3,r2,#16 + uxth r2,r2 + mul r2,r7 + mul r3,r7 + add r6,r2 + mov r2,#0 + adc r5,r2 + lsl r2,r3,#16 + lsr r3,r3,#16 + add r6,r2 + adc r5,r3 + str r6,[r0,#20] + mov r6,#0 + ldr r2,[r1,#24] + lsl r3,r2,#16 + lsr r4,r2,#16 + add r5,r3 + adc r6,r4 + lsr r3,r2,#16 + uxth r2,r2 + mul r2,r7 + mul r3,r7 + add r5,r2 + mov r2,#0 + adc r6,r2 + lsl r2,r3,#16 + lsr r3,r3,#16 + add r5,r2 + adc r6,r3 + str r5,[r0,#24] + mov r5,#0 + ldr r2,[r0,#28] + add r6,r2 + str r6,[r0,#28] + pop {r4,r5,r6,r7,r15} + + .align 2 +__label_for_immediate_56130: + .word 56130 + + .size fe25519_mpyWith121666_asm, .-fe25519_mpyWith121666_asm + diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/cortex_m0_reduce25519.s b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/cortex_m0_reduce25519.s new file mode 100644 index 00000000..4c09f5ea --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/cortex_m0_reduce25519.s @@ -0,0 +1,176 @@ +// Implementation of a partial reduction modulo 2^255 - 38. +// +// B. Haase, Endress + Hauser Conducta GmbH & Ko. KG +// public domain. +// +// gnu assembler format. +// +// Generated and tested with C++ functions in the test subdirectory and on the target. 
+// + + .cpu cortex-m0 + .fpu softvfp + .eabi_attribute 20, 1 + .eabi_attribute 21, 1 + .eabi_attribute 23, 3 + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + .eabi_attribute 26, 1 + .eabi_attribute 30, 2 + .eabi_attribute 34, 0 + .eabi_attribute 18, 4 + .code 16 + + .file "cortex_m0_reduce25519.s" + + .text + .align 2 + + .global fe25519_reduceTo256Bits_asm + .code 16 + .thumb_func + .type fe25519_reduceTo256Bits_asm, %function + +fe25519_reduceTo256Bits_asm: + push {r4,r5,r6,r7,r14} + ldr r2,[r1,#60] + lsr r3,r2,#16 + uxth r2,r2 + mov r7,#38 + mul r2,r7 + mul r3,r7 + ldr r4,[r1,#28] + lsr r5,r3,#16 + lsl r3,r3,#16 + mov r6,#0 + add r4,r2 + adc r5,r6 + add r4,r3 + adc r5,r6 + lsl r2,r4,#1 + lsr r2,r2,#1 + str r2,[r0,#28] + lsr r4,r4,#31 + lsl r5,r5,#1 + orr r4,r5 + mov r2,#19 + mul r2,r4 + ldr r4,[r1,#0] + add r2,r4 + mov r3,#0 + adc r3,r6 + ldr r4,[r1,#32] + lsr r5,r4,#16 + uxth r4,r4 + mul r5,r7 + mul r4,r7 + add r2,r4 + adc r3,r6 + lsl r4,r5,#16 + lsr r5,r5,#16 + add r2,r4 + adc r3,r5 + str r2,[r0,#0] + ldr r4,[r1,#4] + add r3,r4 + mov r2,#0 + adc r2,r6 + ldr r4,[r1,#36] + lsr r5,r4,#16 + uxth r4,r4 + mul r5,r7 + mul r4,r7 + add r3,r4 + adc r2,r6 + lsl r4,r5,#16 + lsr r5,r5,#16 + add r3,r4 + adc r2,r5 + str r3,[r0,#4] + ldr r4,[r1,#8] + add r2,r4 + mov r3,#0 + adc r3,r6 + ldr r4,[r1,#40] + lsr r5,r4,#16 + uxth r4,r4 + mul r5,r7 + mul r4,r7 + add r2,r4 + adc r3,r6 + lsl r4,r5,#16 + lsr r5,r5,#16 + add r2,r4 + adc r3,r5 + str r2,[r0,#8] + ldr r4,[r1,#12] + add r3,r4 + mov r2,#0 + adc r2,r6 + ldr r4,[r1,#44] + lsr r5,r4,#16 + uxth r4,r4 + mul r5,r7 + mul r4,r7 + add r3,r4 + adc r2,r6 + lsl r4,r5,#16 + lsr r5,r5,#16 + add r3,r4 + adc r2,r5 + str r3,[r0,#12] + ldr r4,[r1,#16] + add r2,r4 + mov r3,#0 + adc r3,r6 + ldr r4,[r1,#48] + lsr r5,r4,#16 + uxth r4,r4 + mul r5,r7 + mul r4,r7 + add r2,r4 + adc r3,r6 + lsl r4,r5,#16 + lsr r5,r5,#16 + add r2,r4 + adc r3,r5 + str r2,[r0,#16] + ldr r4,[r1,#20] + add r3,r4 + mov r2,#0 + adc r2,r6 + ldr r4,[r1,#52] + lsr r5,r4,#16 + 
uxth r4,r4 + mul r5,r7 + mul r4,r7 + add r3,r4 + adc r2,r6 + lsl r4,r5,#16 + lsr r5,r5,#16 + add r3,r4 + adc r2,r5 + str r3,[r0,#20] + ldr r4,[r1,#24] + add r2,r4 + mov r3,#0 + adc r3,r6 + ldr r4,[r1,#56] + lsr r5,r4,#16 + uxth r4,r4 + mul r5,r7 + mul r4,r7 + add r2,r4 + adc r3,r6 + lsl r4,r5,#16 + lsr r5,r5,#16 + add r2,r4 + adc r3,r5 + str r2,[r0,#24] + ldr r4,[r0,#28] + add r4,r3 + str r4,[r0,#28] + pop {r4,r5,r6,r7,r15} + + .size fe25519_reduceTo256Bits_asm, .-fe25519_reduceTo256Bits_asm + diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/mul.s b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/mul.s new file mode 100644 index 00000000..155674c6 --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/mul.s @@ -0,0 +1,1109 @@ + .align 2 + .global multiply256x256_asm + .type multiply256x256_asm, %function +multiply256x256_asm: + push {r4-r7,lr} + mov r3, r8 + mov r4, r9 + mov r5, r10 + mov r6, r11 + push {r0-r6} + mov r12, r0 + mov r10, r2 + mov r11, r1 + mov r0,r2 + ldm r0!, {r4,r5,r6,r7} + ldm r1!, {r2,r3,r6,r7} + push {r0,r1} + /////////BEGIN LOW PART ////////////////////// + /////////MUL128///////////// + //MUL64 + mov r6, r5 + mov r1, r2 + sub r5, r4 + sbc r0, r0 + eor r5, r0 + sub r5, r0 + sub r1, r3 + sbc r7, r7 + eor r1, r7 + sub r1, r7 + eor r7, r0 + mov r9, r1 + mov r8, r5 + lsr r1,r4,#16 + uxth r4,r4 + mov r0,r4 + uxth r5,r2 + lsr r2,#16 + mul r0,r5//00 + mul r5,r1//10 + mul r4,r2//01 + mul r1,r2//11 + lsl r2,r4,#16 + lsr r4,r4,#16 + add r0,r2 + adc r1,r4 + lsl r2,r5,#16 + lsr r4,r5,#16 + add r0,r2 + adc r1,r4 + lsr r4, r6,#16 + uxth r6, r6 + uxth r5, r3 + lsr r3, r3, #16 + mov r2, r6 + mul r2, r5 + mul r5, r4 + mul r6, r3 + mul r3, r4 + lsl r4,r5,#16 + lsr r5,r5,#16 + add r2,r4 + adc r3,r5 + lsl r4,r6,#16 + lsr r5,r6,#16 + add r2,r4 + adc r3,r5 + eor r6, r6 + add r2, r1 + adc r3, r6 + mov r1, r9 + mov r5, r8 + mov r8, r0 + lsr r0, r1,#16 + uxth r1,r1 + mov r4,r1 + lsr r6,r5,#16 + uxth r5,r5 + 
mul r1,r5 + mul r4,r6 + mul r5,r0 + mul r0,r6 + lsl r6,r4,#16 + lsr r4,#16 + add r1,r6 + adc r0,r4 + lsl r6,r5,#16 + lsr r5,#16 + add r1,r6 + adc r0,r5 + eor r1,r7 + eor r0,r7 + eor r4, r4 + asr r7, r7, #1 + adc r1, r2 + adc r2, r0 + adc r7, r4 + mov r0, r8 + add r1, r0 + adc r2, r3 + adc r3, r7 + ////////////////////////// + mov r4, r12 + stm r4!, {r0,r1} + push {r4} + push {r0,r1} + mov r1, r10 + mov r10, r2 + ldm r1, {r0, r1, r4, r5} + mov r2, r4 + mov r7, r5 + sub r2, r0 + sbc r7, r1 + sbc r6, r6 + eor r2, r6 + eor r7, r6 + sub r2, r6 + sbc r7, r6 + push {r2, r7} + mov r2, r11 + mov r11, r3 + ldm r2, {r0, r1, r2, r3} + sub r0, r2 + sbc r1, r3 + sbc r7, r7 + eor r0, r7 + eor r1, r7 + sub r0, r7 + sbc r1, r7 + eor r7, r6 + mov r12, r7 + push {r0, r1} + //MUL64 + mov r6, r5 + mov r1, r2 + sub r5, r4 + sbc r0, r0 + eor r5, r0 + sub r5, r0 + sub r1, r3 + sbc r7, r7 + eor r1, r7 + sub r1, r7 + eor r7, r0 + mov r9, r1 + mov r8, r5 + lsr r1,r4,#16 + uxth r4,r4 + mov r0,r4 + uxth r5,r2 + lsr r2,#16 + mul r0,r5//00 + mul r5,r1//10 + mul r4,r2//01 + mul r1,r2//11 + lsl r2,r4,#16 + lsr r4,r4,#16 + add r0,r2 + adc r1,r4 + lsl r2,r5,#16 + lsr r4,r5,#16 + add r0,r2 + adc r1,r4 + lsr r4, r6,#16 + uxth r6, r6 + uxth r5, r3 + lsr r3, r3, #16 + mov r2, r6 + mul r2, r5 + mul r5, r4 + mul r6, r3 + mul r3, r4 + lsl r4,r5,#16 + lsr r5,r5,#16 + add r2,r4 + adc r3,r5 + lsl r4,r6,#16 + lsr r5,r6,#16 + add r2,r4 + adc r3,r5 + eor r6, r6 + add r2, r1 + adc r3, r6 + mov r1, r9 + mov r5, r8 + mov r8, r0 + lsr r0, r1,#16 + uxth r1,r1 + mov r4,r1 + lsr r6,r5,#16 + uxth r5,r5 + mul r1,r5 + mul r4,r6 + mul r5,r0 + mul r0,r6 + lsl r6,r4,#16 + lsr r4,#16 + add r1,r6 + adc r0,r4 + lsl r6,r5,#16 + lsr r5,#16 + add r1,r6 + adc r0,r5 + eor r1,r7 + eor r0,r7 + eor r4, r4 + asr r7, r7, #1 + adc r1, r2 + adc r2, r0 + adc r7, r4 + mov r0, r8 + add r1, r0 + adc r2, r3 + adc r3, r7 + mov r4, r10 + mov r5, r11 + eor r6, r6 + add r0, r4 + adc r1, r5 + adc r2, r6 + adc r3, r6 + mov r10, r2 + mov r11, r3 + pop 
{r2-r5} + push {r0, r1} + mov r6, r5 + mov r1, r2 + sub r5, r4 + sbc r0, r0 + eor r5, r0 + sub r5, r0 + sub r1, r3 + sbc r7, r7 + eor r1, r7 + sub r1, r7 + eor r7, r0 + mov r9, r1 + mov r8, r5 + lsr r1,r4,#16 + uxth r4,r4 + mov r0,r4 + uxth r5,r2 + lsr r2,#16 + mul r0,r5//00 + mul r5,r1//10 + mul r4,r2//01 + mul r1,r2//11 + lsl r2,r4,#16 + lsr r4,r4,#16 + add r0,r2 + adc r1,r4 + lsl r2,r5,#16 + lsr r4,r5,#16 + add r0,r2 + adc r1,r4 + lsr r4, r6,#16 + uxth r6, r6 + uxth r5, r3 + lsr r3, r3, #16 + mov r2, r6 + mul r2, r5 + mul r5, r4 + mul r6, r3 + mul r3, r4 + lsl r4,r5,#16 + lsr r5,r5,#16 + add r2,r4 + adc r3,r5 + lsl r4,r6,#16 + lsr r5,r6,#16 + add r2,r4 + adc r3,r5 + eor r6, r6 + add r2, r1 + adc r3, r6 + mov r1, r9 + mov r5, r8 + mov r8, r0 + lsr r0, r1,#16 + uxth r1,r1 + mov r4,r1 + lsr r6,r5,#16 + uxth r5,r5 + mul r1,r5 + mul r4,r6 + mul r5,r0 + mul r0,r6 + lsl r6,r4,#16 + lsr r4,#16 + add r1,r6 + adc r0,r4 + lsl r6,r5,#16 + lsr r5,#16 + add r1,r6 + adc r0,r5 + eor r1,r7 + eor r0,r7 + eor r4, r4 + asr r7, r7, #1 + adc r1, r2 + adc r2, r0 + adc r7, r4 + mov r0, r8 + add r1, r0 + adc r2, r3 + adc r3, r7 + pop {r4, r5} + mov r6, r12 + mov r7, r12 + eor r0, r6 + eor r1, r6 + eor r2, r6 + eor r3, r6 + asr r6, r6, #1 + adc r0, r4 + adc r1, r5 + adc r4, r2 + adc r5, r3 + eor r2, r2 + adc r6,r2 + adc r7,r2 + pop {r2, r3} + mov r8, r2 + mov r9, r3 + add r2, r0 + adc r3, r1 + mov r0, r10 + mov r1, r11 + adc r4, r0 + adc r5, r1 + adc r6, r0 + adc r7, r1 + ////////END LOW PART///////////////////// + pop {r0} + stm r0!, {r2,r3} + pop {r1,r2} + push {r0} + push {r4-r7} + mov r10, r1 + mov r11, r2 + ldm r1!, {r4, r5} + ldm r2, {r2, r3} + /////////BEGIN HIGH PART//////////////// + /////////MUL128///////////// + //MUL64 + mov r6, r5 + mov r1, r2 + sub r5, r4 + sbc r0, r0 + eor r5, r0 + sub r5, r0 + sub r1, r3 + sbc r7, r7 + eor r1, r7 + sub r1, r7 + eor r7, r0 + mov r9, r1 + mov r8, r5 + lsr r1,r4,#16 + uxth r4,r4 + mov r0,r4 + uxth r5,r2 + lsr r2,#16 + mul r0,r5//00 + mul 
r5,r1//10 + mul r4,r2//01 + mul r1,r2//11 + lsl r2,r4,#16 + lsr r4,r4,#16 + add r0,r2 + adc r1,r4 + lsl r2,r5,#16 + lsr r4,r5,#16 + add r0,r2 + adc r1,r4 + lsr r4, r6,#16 + uxth r6, r6 + uxth r5, r3 + lsr r3, r3, #16 + mov r2, r6 + mul r2, r5 + mul r5, r4 + mul r6, r3 + mul r3, r4 + lsl r4,r5,#16 + lsr r5,r5,#16 + add r2,r4 + adc r3,r5 + lsl r4,r6,#16 + lsr r5,r6,#16 + add r2,r4 + adc r3,r5 + eor r6, r6 + add r2, r1 + adc r3, r6 + mov r1, r9 + mov r5, r8 + mov r8, r0 + lsr r0, r1,#16 + uxth r1,r1 + mov r4,r1 + lsr r6,r5,#16 + uxth r5,r5 + mul r1,r5 + mul r4,r6 + mul r5,r0 + mul r0,r6 + lsl r6,r4,#16 + lsr r4,#16 + add r1,r6 + adc r0,r4 + lsl r6,r5,#16 + lsr r5,#16 + add r1,r6 + adc r0,r5 + eor r1,r7 + eor r0,r7 + eor r4, r4 + asr r7, r7, #1 + adc r1, r2 + adc r2, r0 + adc r7, r4 + mov r0, r8 + add r1, r0 + adc r2, r3 + adc r3, r7 + push {r0,r1} + mov r1, r10 + mov r10, r2 + ldm r1, {r0, r1, r4, r5} + mov r2, r4 + mov r7, r5 + sub r2, r0 + sbc r7, r1 + sbc r6, r6 + eor r2, r6 + eor r7, r6 + sub r2, r6 + sbc r7, r6 + push {r2, r7} + mov r2, r11 + mov r11, r3 + ldm r2, {r0, r1, r2, r3} + sub r0, r2 + sbc r1, r3 + sbc r7, r7 + eor r0, r7 + eor r1, r7 + sub r0, r7 + sbc r1, r7 + eor r7, r6 + mov r12, r7 + push {r0, r1} + //MUL64 + mov r6, r5 + mov r1, r2 + sub r5, r4 + sbc r0, r0 + eor r5, r0 + sub r5, r0 + sub r1, r3 + sbc r7, r7 + eor r1, r7 + sub r1, r7 + eor r7, r0 + mov r9, r1 + mov r8, r5 + lsr r1,r4,#16 + uxth r4,r4 + mov r0,r4 + uxth r5,r2 + lsr r2,#16 + mul r0,r5//00 + mul r5,r1//10 + mul r4,r2//01 + mul r1,r2//11 + lsl r2,r4,#16 + lsr r4,r4,#16 + add r0,r2 + adc r1,r4 + lsl r2,r5,#16 + lsr r4,r5,#16 + add r0,r2 + adc r1,r4 + lsr r4, r6,#16 + uxth r6, r6 + uxth r5, r3 + lsr r3, r3, #16 + mov r2, r6 + mul r2, r5 + mul r5, r4 + mul r6, r3 + mul r3, r4 + lsl r4,r5,#16 + lsr r5,r5,#16 + add r2,r4 + adc r3,r5 + lsl r4,r6,#16 + lsr r5,r6,#16 + add r2,r4 + adc r3,r5 + eor r6, r6 + add r2, r1 + adc r3, r6 + mov r1, r9 + mov r5, r8 + mov r8, r0 + lsr r0, r1,#16 + uxth 
r1,r1 + mov r4,r1 + lsr r6,r5,#16 + uxth r5,r5 + mul r1,r5 + mul r4,r6 + mul r5,r0 + mul r0,r6 + lsl r6,r4,#16 + lsr r4,#16 + add r1,r6 + adc r0,r4 + lsl r6,r5,#16 + lsr r5,#16 + add r1,r6 + adc r0,r5 + eor r1,r7 + eor r0,r7 + eor r4, r4 + asr r7, r7, #1 + adc r1, r2 + adc r2, r0 + adc r7, r4 + mov r0, r8 + add r1, r0 + adc r2, r3 + adc r3, r7 + mov r4, r10 + mov r5, r11 + eor r6, r6 + add r0, r4 + adc r1, r5 + adc r2, r6 + adc r3, r6 + mov r10, r2 + mov r11, r3 + pop {r2-r5} + push {r0, r1} + mov r6, r5 + mov r1, r2 + sub r5, r4 + sbc r0, r0 + eor r5, r0 + sub r5, r0 + sub r1, r3 + sbc r7, r7 + eor r1, r7 + sub r1, r7 + eor r7, r0 + mov r9, r1 + mov r8, r5 + lsr r1,r4,#16 + uxth r4,r4 + mov r0,r4 + uxth r5,r2 + lsr r2,#16 + mul r0,r5//00 + mul r5,r1//10 + mul r4,r2//01 + mul r1,r2//11 + lsl r2,r4,#16 + lsr r4,r4,#16 + add r0,r2 + adc r1,r4 + lsl r2,r5,#16 + lsr r4,r5,#16 + add r0,r2 + adc r1,r4 + lsr r4, r6,#16 + uxth r6, r6 + uxth r5, r3 + lsr r3, r3, #16 + mov r2, r6 + mul r2, r5 + mul r5, r4 + mul r6, r3 + mul r3, r4 + lsl r4,r5,#16 + lsr r5,r5,#16 + add r2,r4 + adc r3,r5 + lsl r4,r6,#16 + lsr r5,r6,#16 + add r2,r4 + adc r3,r5 + eor r6, r6 + add r2, r1 + adc r3, r6 + mov r1, r9 + mov r5, r8 + mov r8, r0 + lsr r0, r1,#16 + uxth r1,r1 + mov r4,r1 + lsr r6,r5,#16 + uxth r5,r5 + mul r1,r5 + mul r4,r6 + mul r5,r0 + mul r0,r6 + lsl r6,r4,#16 + lsr r4,#16 + add r1,r6 + adc r0,r4 + lsl r6,r5,#16 + lsr r5,#16 + add r1,r6 + adc r0,r5 + eor r1,r7 + eor r0,r7 + eor r4, r4 + asr r7, r7, #1 + adc r1, r2 + adc r2, r0 + adc r7, r4 + mov r0, r8 + add r1, r0 + adc r2, r3 + adc r3, r7 + pop {r4, r5} + mov r6, r12 + mov r7, r12 + eor r0, r6 + eor r1, r6 + eor r2, r6 + eor r3, r6 + asr r6, r6, #1 + adc r0, r4 + adc r1, r5 + adc r4, r2 + adc r5, r3 + eor r2, r2 + adc r6,r2 //0,1 + adc r7,r2 + pop {r2, r3} + mov r8, r2 + mov r9, r3 + add r2, r0 + adc r3, r1 + mov r0, r10 + mov r1, r11 + adc r4, r0 + adc r5, r1 + adc r6, r0 + adc r7, r1 + ////////END HIGH PART///////////////////// + 
mov r0, r8 + mov r1, r9 + mov r8, r6 + mov r9, r7 + pop {r6, r7} + add r0, r6 + adc r1, r7 + pop {r6, r7} + adc r2, r6 + adc r3, r7 + pop {r7} + stm r7!, {r0-r3} + mov r10, r7 + eor r0,r0 + mov r6, r8 + mov r7, r9 + adc r4, r0 + adc r5, r0 + adc r6, r0 + adc r7, r0 + pop {r0,r1,r2} + mov r12, r2 + push {r0, r4-r7} + ldm r1, {r0-r7} + sub r0, r4 + sbc r1, r5 + sbc r2, r6 + sbc r3, r7 + eor r4, r4 + sbc r4, r4 + eor r0, r4 + eor r1, r4 + eor r2, r4 + eor r3, r4 + sub r0, r4 + sbc r1, r4 + sbc r2, r4 + sbc r3, r4 + mov r6, r12 + mov r12, r4 //carry + mov r5, r10 + stm r5!, {r0-r3} + mov r11, r5 + mov r8, r0 + mov r9, r1 + ldm r6, {r0-r7} + sub r4, r0 + sbc r5, r1 + sbc r6, r2 + sbc r7, r3 + eor r0, r0 + sbc r0, r0 + eor r4, r0 + eor r5, r0 + eor r6, r0 + eor r7, r0 + sub r4, r0 + sbc r5, r0 + sbc r6, r0 + sbc r7, r0 + mov r1, r12 + eor r0, r1 + mov r1, r11 + stm r1!, {r4-r7} + push {r0} + mov r2, r8 + mov r3, r9 + /////////BEGIN MIDDLE PART//////////////// + /////////MUL128///////////// + //MUL64 + mov r6, r5 + mov r1, r2 + sub r5, r4 + sbc r0, r0 + eor r5, r0 + sub r5, r0 + sub r1, r3 + sbc r7, r7 + eor r1, r7 + sub r1, r7 + eor r7, r0 + mov r9, r1 + mov r8, r5 + lsr r1,r4,#16 + uxth r4,r4 + mov r0,r4 + uxth r5,r2 + lsr r2,#16 + mul r0,r5//00 + mul r5,r1//10 + mul r4,r2//01 + mul r1,r2//11 + lsl r2,r4,#16 + lsr r4,r4,#16 + add r0,r2 + adc r1,r4 + lsl r2,r5,#16 + lsr r4,r5,#16 + add r0,r2 + adc r1,r4 + lsr r4, r6,#16 + uxth r6, r6 + uxth r5, r3 + lsr r3, r3, #16 + mov r2, r6 + mul r2, r5 + mul r5, r4 + mul r6, r3 + mul r3, r4 + lsl r4,r5,#16 + lsr r5,r5,#16 + add r2,r4 + adc r3,r5 + lsl r4,r6,#16 + lsr r5,r6,#16 + add r2,r4 + adc r3,r5 + eor r6, r6 + add r2, r1 + adc r3, r6 + mov r1, r9 + mov r5, r8 + mov r8, r0 + lsr r0, r1,#16 + uxth r1,r1 + mov r4,r1 + lsr r6,r5,#16 + uxth r5,r5 + mul r1,r5 + mul r4,r6 + mul r5,r0 + mul r0,r6 + lsl r6,r4,#16 + lsr r4,#16 + add r1,r6 + adc r0,r4 + lsl r6,r5,#16 + lsr r5,#16 + add r1,r6 + adc r0,r5 + eor r1,r7 + eor r0,r7 + eor r4, 
r4 + asr r7, r7, #1 + adc r1, r2 + adc r2, r0 + adc r7, r4 + mov r0, r8 + add r1, r0 + adc r2, r3 + adc r3, r7 + push {r0,r1} + mov r1, r10 + mov r10, r2 + ldm r1, {r0, r1, r4, r5} + mov r2, r4 + mov r7, r5 + sub r2, r0 + sbc r7, r1 + sbc r6, r6 + eor r2, r6 + eor r7, r6 + sub r2, r6 + sbc r7, r6 + push {r2, r7} + mov r2, r11 + mov r11, r3 + ldm r2, {r0, r1, r2, r3} + sub r0, r2 + sbc r1, r3 + sbc r7, r7 + eor r0, r7 + eor r1, r7 + sub r0, r7 + sbc r1, r7 + eor r7, r6 + mov r12, r7 + push {r0, r1} + //MUL64 + mov r6, r5 + mov r1, r2 + sub r5, r4 + sbc r0, r0 + eor r5, r0 + sub r5, r0 + sub r1, r3 + sbc r7, r7 + eor r1, r7 + sub r1, r7 + eor r7, r0 + mov r9, r1 + mov r8, r5 + lsr r1,r4,#16 + uxth r4,r4 + mov r0,r4 + uxth r5,r2 + lsr r2,#16 + mul r0,r5//00 + mul r5,r1//10 + mul r4,r2//01 + mul r1,r2//11 + lsl r2,r4,#16 + lsr r4,r4,#16 + add r0,r2 + adc r1,r4 + lsl r2,r5,#16 + lsr r4,r5,#16 + add r0,r2 + adc r1,r4 + lsr r4, r6,#16 + uxth r6, r6 + uxth r5, r3 + lsr r3, r3, #16 + mov r2, r6 + mul r2, r5 + mul r5, r4 + mul r6, r3 + mul r3, r4 + lsl r4,r5,#16 + lsr r5,r5,#16 + add r2,r4 + adc r3,r5 + lsl r4,r6,#16 + lsr r5,r6,#16 + add r2,r4 + adc r3,r5 + eor r6, r6 + add r2, r1 + adc r3, r6 + mov r1, r9 + mov r5, r8 + mov r8, r0 + lsr r0, r1,#16 + uxth r1,r1 + mov r4,r1 + lsr r6,r5,#16 + uxth r5,r5 + mul r1,r5 + mul r4,r6 + mul r5,r0 + mul r0,r6 + lsl r6,r4,#16 + lsr r4,#16 + add r1,r6 + adc r0,r4 + lsl r6,r5,#16 + lsr r5,#16 + add r1,r6 + adc r0,r5 + eor r1,r7 + eor r0,r7 + eor r4, r4 + asr r7, r7, #1 + adc r1, r2 + adc r2, r0 + adc r7, r4 + mov r0, r8 + add r1, r0 + adc r2, r3 + adc r3, r7 + mov r4, r10 + mov r5, r11 + eor r6, r6 + add r0, r4 + adc r1, r5 + adc r2, r6 + adc r3, r6 + mov r10, r2 + mov r11, r3 + pop {r2-r5} + push {r0, r1} + mov r6, r5 + mov r1, r2 + sub r5, r4 + sbc r0, r0 + eor r5, r0 + sub r5, r0 + sub r1, r3 + sbc r7, r7 + eor r1, r7 + sub r1, r7 + eor r7, r0 + mov r9, r1 + mov r8, r5 + lsr r1,r4,#16 + uxth r4,r4 + mov r0,r4 + uxth r5,r2 + lsr r2,#16 
+ mul r0,r5//00 + mul r5,r1//10 + mul r4,r2//01 + mul r1,r2//11 + lsl r2,r4,#16 + lsr r4,r4,#16 + add r0,r2 + adc r1,r4 + lsl r2,r5,#16 + lsr r4,r5,#16 + add r0,r2 + adc r1,r4 + lsr r4, r6,#16 + uxth r6, r6 + uxth r5, r3 + lsr r3, r3, #16 + mov r2, r6 + mul r2, r5 + mul r5, r4 + mul r6, r3 + mul r3, r4 + lsl r4,r5,#16 + lsr r5,r5,#16 + add r2,r4 + adc r3,r5 + lsl r4,r6,#16 + lsr r5,r6,#16 + add r2,r4 + adc r3,r5 + eor r6, r6 + add r2, r1 + adc r3, r6 + mov r1, r9 + mov r5, r8 + mov r8, r0 + lsr r0, r1,#16 + uxth r1,r1 + mov r4,r1 + lsr r6,r5,#16 + uxth r5,r5 + mul r1,r5 + mul r4,r6 + mul r5,r0 + mul r0,r6 + lsl r6,r4,#16 + lsr r4,#16 + add r1,r6 + adc r0,r4 + lsl r6,r5,#16 + lsr r5,#16 + add r1,r6 + adc r0,r5 + eor r1,r7 + eor r0,r7 + eor r4, r4 + asr r7, r7, #1 + adc r1, r2 + adc r2, r0 + adc r7, r4 + mov r0, r8 + add r1, r0 + adc r2, r3 + adc r3, r7 + pop {r4, r5} + mov r6, r12 + mov r7, r12 + eor r0, r6 + eor r1, r6 + eor r2, r6 + eor r3, r6 + asr r6, r6, #1 + adc r0, r4 + adc r1, r5 + adc r4, r2 + adc r5, r3 + eor r2, r2 + adc r6,r2 //0,1 + adc r7,r2 + pop {r2, r3} + mov r8, r2 + mov r9, r3 + add r2, r0 + adc r3, r1 + mov r0, r10 + mov r1, r11 + adc r4, r0 + adc r5, r1 + adc r6, r0 + adc r7, r1 + //////////END MIDDLE PART//////////////// + pop {r0,r1} //r0,r1 + mov r12, r0 //negative + eor r2, r0 + eor r3, r0 + eor r4, r0 + eor r5, r0 + eor r6, r0 + eor r7, r0 + push {r4-r7} + ldm r1!, {r4-r7} + mov r11, r1 //reference + mov r1, r9 + eor r1, r0 + mov r10, r4 + mov r4, r8 + asr r0, #1 + eor r0, r4 + mov r4, r10 + adc r0, r4 + adc r1, r5 + adc r2, r6 + adc r3, r7 + eor r4, r4 + adc r4, r4 + mov r10, r4 //carry + mov r4, r11 + ldm r4, {r4-r7} + add r0, r4 + adc r1, r5 + adc r2, r6 + adc r3, r7 + mov r9, r4 + mov r4, r11 + stm r4!, {r0-r3} + mov r11, r4 + pop {r0-r3} + mov r4, r9 + adc r4, r0 + adc r5, r1 + adc r6, r2 + adc r7, r3 + mov r1, #0 + adc r1, r1 + mov r0, r10 + mov r10, r1 //carry + asr r0, #1 + pop {r0-r3} + adc r4, r0 + adc r5, r1 + adc r6, r2 + adc 
r7, r3 + mov r8, r0 + mov r0, r11 + stm r0!, {r4-r7} + mov r11, r0 + mov r0, r8 + mov r6, r12 + mov r5, r10 + eor r4, r4 + adc r5, r6 + adc r6, r4 + add r0, r5 + adc r1, r6 + adc r2, r6 + adc r3, r6 + mov r7, r11 + stm r7!, {r0-r3} + pop {r3-r6} + mov r8, r3 + mov r9, r4 + mov r10, r5 + mov r11, r6 + pop {r4-r7,pc} + bx lr +.size multiply256x256_asm, .-multiply256x256_asm + diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/scalarmult.c b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/scalarmult.c new file mode 100644 index 00000000..488aac78 --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/scalarmult.c @@ -0,0 +1,761 @@ +/* ======================= + ============================ C/C++ HEADER FILE ============================= + ======================= + + Collection of all required submodules from naclM0 required for curve25519 + scalar multiplication (not including randomization, etc.) alone. + + Library naclM0 is largely based on the avrNacl work of M. Hutter and P. Schwabe. + + Will compile to the two functions + + int + crypto_scalarmult_curve25519_base( + unsigned char* q, + const unsigned char* n + ); + + int + crypto_scalarmult_curve25519 ( + unsigned char* r, + const unsigned char* s, + const unsigned char* p + ); + + Requires inttypes.h header and the four external assembly functions + + extern void + fe25519_reduceTo256Bits_asm ( + fe25519 *res, + const UN_512bitValue *in + ); + + extern void + fe25519_mpyWith121666_asm ( + fe25519* out, + const fe25519* in + ); + + extern void + multiply256x256_asm ( + UN_512bitValue* result, + const UN_256bitValue* x, + const UN_256bitValue* y + ); + + extern void + square256_asm ( + UN_512bitValue* result, + const UN_256bitValue* x + ); + + \file scalarmult.c + + \Author B. Haase, Endress + Hauser Conducta GmbH & Co.
KG + + License: CC Common Creative license Attribution 4.0 International (CC BY 4.0) + http://creativecommons.org/licenses/by/4.0/ + ============================================================================*/ + +#include <inttypes.h> + +// Define DH_SWAP_BY_POINTERS to implement the conditional swaps by swapping pointers; +// leave the line below commented out to implement them by data moves. +// NOTE: swapPointersConditionally() is itself commented out further down, so it has to be +// re-enabled as well before this symbol can be defined. +//#define DH_SWAP_BY_POINTERS + +// Define the symbol to 1 to replace the last three ladder steps with point doublings. +// Leave it undefined (or define it to 0) in order to only use ladder steps. +//#define DH_REPLACE_LAST_THREE_LADDERSTEPS_WITH_DOUBLINGS 1 + +typedef uint8_t uint8; +typedef uint16_t uint16; +typedef uint32_t uint32; +typedef uint64_t uint64; +typedef uintptr_t uintptr; + +typedef int8_t int8; +typedef int16_t int16; +typedef int32_t int32; +typedef int64_t int64; +typedef intptr_t intptr; + +// 256 bit value, accessible as bytes, 16 bit, 32 bit or 64 bit words. +// Note that it's important to define the uint8 as first union member, so that +// an array of uint8 may be used as initializer. +typedef union UN_256bitValue_ +{ + uint8 as_uint8[32]; + uint16 as_uint16[16]; + uint32 as_uint32[8]; + uint64 as_uint64[4]; +} UN_256bitValue; + +// 512 bit value, additionally accessible as two consecutive 256 bit values. +// Note that it's important to define the uint8 as first union member, so that +// an array of uint8 may be used as initializer. +typedef union UN_512bitValue_ +{ + uint8 as_uint8[64]; + uint16 as_uint16[32]; + uint32 as_uint32[16]; + uint64 as_uint64[8]; + UN_256bitValue as_256_bitValue[2]; +} UN_512bitValue; + +// A field element is stored as a plain 256 bit value. +typedef UN_256bitValue fe25519; + +// **************************************************** +// Assembly functions.
+// **************************************************** +// The #define lines below map the names used in the C code onto the *_asm symbols. + +extern void +fe25519_reduceTo256Bits_asm( + fe25519 *res, + const UN_512bitValue *in +); + +#define fe25519_mpyWith121666 fe25519_mpyWith121666_asm +extern void +fe25519_mpyWith121666_asm ( + fe25519* out, + const fe25519* in +); + +#define multiply256x256 multiply256x256_asm +extern void +multiply256x256( + UN_512bitValue* result, + const UN_256bitValue* x, + const UN_256bitValue* y +); + +#define square256 square256_asm +extern void +square256( + UN_512bitValue* result, + const UN_256bitValue* x +); + +// **************************************************** +// C functions for fe25519 +// **************************************************** + +/* Copies the field element 'source' to 'dest', one 32 bit word at a time. */ +static void +fe25519_cpy( + fe25519* dest, + const fe25519* source +) +{ + uint32 ctr; + + for (ctr = 0; ctr < 8; ctr++) + { + dest->as_uint32[ctr] = source->as_uint32[ctr]; + } +} + +/* Loads the 32 bytes of 'in' into 'out' and clears the top bit of byte 31. */ +static void +fe25519_unpack( + volatile fe25519* out, + const unsigned char in[32] +) +{ + uint8 ctr; + + for (ctr = 0; ctr < 32; ctr++) + { + out->as_uint8[ctr] = in[ctr]; + } + out->as_uint8[31] &= 0x7f; // make sure that the top bit of the last byte is cleared. +} + +/* Subtracts valueToSubstract from baseValue and stores the result in 'out'. + The overflow of the most significant word is folded back into the low words + as a multiple of 19 (see the comments in the body). */ +static void +fe25519_sub( + fe25519* out, + const fe25519* baseValue, + const fe25519* valueToSubstract +) +{ + uint16 ctr; + int64 accu = 0; + + // First subtract the most significant word, so that we may + // reduce the result "on the fly". + accu = baseValue->as_uint32[7]; + accu -= valueToSubstract->as_uint32[7]; + + // We always set bit #31, and compensate this by subtracting 1 from the reduction + // value. + out->as_uint32[7] = ((uint32)accu) | 0x80000000ul; + + accu = 19 * ((int32)(accu >> 31) - 1); + // ^ "-1" is the compensation for the "| 0x80000000ul" above. + // This choice makes sure, that the result will be positive!
+ + for (ctr = 0; ctr < 7; ctr += 1) + { + accu += baseValue->as_uint32[ctr]; + accu -= valueToSubstract->as_uint32[ctr]; + + out->as_uint32[ctr] = (uint32)accu; + accu >>= 32; + } + accu += out->as_uint32[7]; + out->as_uint32[7] = (uint32)accu; +} +/* Adds valueToAdd to baseValue and stores the result in 'out'. Bits 31 and up of the sum of the most significant words are folded back into the low words multiplied by 19. */ +static void +fe25519_add( + fe25519* out, + const fe25519* baseValue, + const fe25519* valueToAdd +) +{ + uint16 ctr = 0; + uint64 accu = 0; + + // We first add the most significant word, so that we may reduce + // "on the fly". + accu = baseValue->as_uint32[7]; + accu += valueToAdd->as_uint32[7]; + out->as_uint32[7] = ((uint32)accu) & 0x7ffffffful; + + accu = ((uint32)(accu >> 31)) * 19; + + for (ctr = 0; ctr < 7; ctr += 1) + { + accu += baseValue->as_uint32[ctr]; + accu += valueToAdd->as_uint32[ctr]; + + out->as_uint32[ctr] = (uint32)accu; + accu >>= 32; + } + accu += out->as_uint32[7]; + out->as_uint32[7] = (uint32)accu; +} + +/* result = in1 * in2: computes the 512 bit product with multiply256x256 and + reduces it to 256 bits with fe25519_reduceTo256Bits_asm. */ +static void +fe25519_mul( + fe25519* result, + const fe25519* in1, + const fe25519* in2 +) +{ + UN_512bitValue tmp; + + multiply256x256(&tmp, in1, in2); + fe25519_reduceTo256Bits_asm(result,&tmp); +} + +/* result = in * in: computes the 512 bit square with square256 and + reduces it to 256 bits with fe25519_reduceTo256Bits_asm. */ +static void +fe25519_square( + fe25519* result, + const fe25519* in +) +{ + UN_512bitValue tmp; + + square256(&tmp, in); + fe25519_reduceTo256Bits_asm(result,&tmp); +} + +/* Reduces 'inout' in place: determines how many times the prime has to be + subtracted, adds that many times 19 and masks off the top bit of the most significant word. */ +static void +fe25519_reduceCompletely( + volatile fe25519* inout +) +{ + uint32 numberOfTimesToSubstractPrime; + uint32 initialGuessForNumberOfTimesToSubstractPrime = inout->as_uint32[7] >> + 31; + uint64 accu; + uint8 ctr; + + // add one additional 19 to the estimated number of reductions. + // Do the calculation without writing back the results to memory. + // + // The initial guess of required numbers of reductions is based + // on bit #31 (the top bit) of the most significant word. + // This initial guess may be wrong, since we might have a value + // v in the range + // 2^255 - 19 <= v < 2^255 + // . After adding 19 to the value, we obtain the correct + // number of required subtractions.
+ accu = initialGuessForNumberOfTimesToSubstractPrime * 19 + 19; + + for (ctr = 0; ctr < 7; ctr++) + { + accu += inout->as_uint32[ctr]; + accu >>= 32; + } + accu += inout->as_uint32[7]; + + numberOfTimesToSubstractPrime = (uint32)(accu >> 31); + + // Do the reduction: add 19 once per required subtraction of the prime and drop the top bit. + accu = numberOfTimesToSubstractPrime * 19; + + for (ctr = 0; ctr < 7; ctr++) + { + accu += inout->as_uint32[ctr]; + inout->as_uint32[ctr] = (uint32)accu; + accu >>= 32; + } + accu += inout->as_uint32[7]; + inout->as_uint32[7] = accu & 0x7ffffffful; +} + +/// We are already using a packed radix 16 representation for fe25519. The real use for this function +/// is for architectures that use more bits for storing a fe25519 in a representation where multiplication +/// may be calculated more efficiently. +/// Here we fully reduce the value in place (note that 'in' is modified) and then simply copy the data. +static void +fe25519_pack( + unsigned char out[32], + volatile fe25519* in +) +{ + uint8 ctr; + + fe25519_reduceCompletely(in); + + for (ctr = 0; ctr < 32; ctr++) + { + out[ctr] = in->as_uint8[ctr]; + } +} + +// Computes r = x^(2^255 - 21) with the square/multiply chain annotated in the body, +// using t0, t1 and t2 as scratch buffers (their contents are overwritten). +// NOTE(review): 2^255 - 21 equals p - 2 for p = 2^255 - 19, so this is presumably the +// field inversion by Fermat's little theorem - confirm. +// Note, that r and x are allowed to overlap! +static void +fe25519_invert_useProvidedScratchBuffers( + fe25519* r, + const fe25519* x, + fe25519* t0, + fe25519* t1, + fe25519* t2 +) +{ + fe25519 *z11 = r; // store z11 in r (in order to save one temporary). + fe25519 *z2_10_0 = t1; + fe25519 *z2_50_0 = t2; + fe25519 *z2_100_0 = z2_10_0; + + uint8 i; + + { + fe25519 *z2 = z2_50_0; + + /* 2 */ fe25519_square(z2, x); + /* 4 */ fe25519_square(t0, z2); + /* 8 */ fe25519_square(t0, t0); + /* 9 */ fe25519_mul(z2_10_0, t0, x); + /* 11 */ fe25519_mul(z11, z2_10_0, z2); + + // z2 is dead.
+ } + + /* 22 */ fe25519_square(t0, z11); + /* 2^5 - 2^0 = 31 */ fe25519_mul(z2_10_0, t0, z2_10_0); + + /* 2^6 - 2^1 */ fe25519_square(t0, z2_10_0); + /* 2^7 - 2^2 */ fe25519_square(t0, t0); + /* 2^8 - 2^3 */ fe25519_square(t0, t0); + /* 2^9 - 2^4 */ fe25519_square(t0, t0); + /* 2^10 - 2^5 */ fe25519_square(t0, t0); + /* 2^10 - 2^0 */ fe25519_mul(z2_10_0, t0, z2_10_0); + + /* 2^11 - 2^1 */ fe25519_square(t0, z2_10_0); + + /* 2^20 - 2^10 */ for (i = 1; i < 10; i ++) + { + fe25519_square(t0, t0); + } + /* 2^20 - 2^0 */ fe25519_mul(z2_50_0, t0, z2_10_0); + + /* 2^21 - 2^1 */ fe25519_square(t0, z2_50_0); + + /* 2^40 - 2^20 */ for (i = 1; i < 20; i ++) + { + fe25519_square(t0, t0); + } + /* 2^40 - 2^0 */ fe25519_mul(t0, t0, z2_50_0); + + /* 2^41 - 2^1 */ fe25519_square(t0, t0); + + /* 2^50 - 2^10 */ for (i = 1; i < 10; i ++) + { + fe25519_square(t0, t0); + } + /* 2^50 - 2^0 */ fe25519_mul(z2_50_0, t0, z2_10_0); + + /* 2^51 - 2^1 */ fe25519_square(t0, z2_50_0); + + /* 2^100 - 2^50 */ for (i = 1; i < 50; i ++) + { + fe25519_square(t0, t0); + } + /* 2^100 - 2^0 */ fe25519_mul(z2_100_0, t0, z2_50_0); + + /* 2^101 - 2^1 */ fe25519_square(t0, z2_100_0); + + /* 2^200 - 2^100 */ for (i = 1; i < 100; i ++) + { + fe25519_square(t0, t0); + } + /* 2^200 - 2^0 */ fe25519_mul(t0, t0, z2_100_0); + + /* 2^250 - 2^50 */ for (i = 0; i < 50; i ++) + { + fe25519_square(t0, t0); + } + /* 2^250 - 2^0 */ fe25519_mul(t0, t0, z2_50_0); + + /* 2^255 - 2^5 */ for (i = 0; i < 5; i ++) + { + fe25519_square(t0, t0); + } + /* 2^255 - 21 */ fe25519_mul(r, t0, z11); +} +/* Sets all eight 32 bit words of 'out' to zero. */ +static void +fe25519_setzero( + fe25519* out +) +{ + uint8 ctr; + + for (ctr = 0; ctr < 8; ctr++) + { + out->as_uint32[ctr] = 0; + } +} +/* Sets 'out' to the value one: word 0 becomes 1, all other words become 0. */ +static void +fe25519_setone( + fe25519* out +) +{ + uint8 ctr; + + out->as_uint32[0] = 1; + + for (ctr = 1; ctr < 8; ctr++) + { + out->as_uint32[ctr] = 0; + } +} + +/* +static void +swapPointersConditionally (void **p1, void **p2, uint8 condition) +{ + // Branch-free equivalent of this code: + // + // if
(condition) + // { + // void *temp; + // temp = *p2; + // *p2 = *p1; + // *p1 = temp; + // } + + uintptr mask = condition; + uintptr val1 = (uintptr) *p1; + uintptr val2 = (uintptr) *p2; + uintptr temp = val2 ^ val1; + + mask = (uintptr)( - (intptr) mask ); + temp ^= mask & (temp ^ val1); + val1 ^= mask & (val1 ^ val2); + val2 ^= mask & (val2 ^ temp); + + *p1 = (void *) val1; + *p2 = (void *) val2; +} +*/ + +// Swaps in1 and in2 word by word if condition is 1 and leaves them unchanged if it is 0. +// The selection is done with the mask (-condition) instead of a branch. +static void +fe25519_cswap( + fe25519* in1, + fe25519* in2, + int condition +) +{ + int32 mask = condition; + uint32 ctr; + + mask = -mask; + + for (ctr = 0; ctr < 8; ctr++) + { + uint32 val1 = in1->as_uint32[ctr]; + uint32 val2 = in2->as_uint32[ctr]; + uint32 temp = val1; + + val1 ^= mask & (val2 ^ val1); + val2 ^= mask & (val2 ^ temp); + + + in1->as_uint32[ctr] = val1; + in2->as_uint32[ctr] = val2; + } +} + +// **************************************************** +// Scalar multiplication implementation. +// **************************************************** + +typedef struct _ST_curve25519ladderstepWorkingState +{ + // The x coordinate of the base point in affine coordinates + fe25519 x0; + + // The two working points p, q, in projective coordinates. Possibly randomized. + fe25519 xp; + fe25519 zp; + fe25519 xq; + fe25519 zq; + + // Working copy of the scalar. + volatile UN_256bitValue s; + + // Index of the next scalar bit to process, and the value of the bit processed last. + int nextScalarBitToProcess; + uint8 previousProcessedBit; + +#ifdef DH_SWAP_BY_POINTERS + fe25519 *pXp; + fe25519 *pZp; + fe25519 *pXq; + fe25519 *pZq; +#endif + +} ST_curve25519ladderstepWorkingState; + +// Performs one combined differential addition and doubling step on the working points of pState. +static void +curve25519_ladderstep( + ST_curve25519ladderstepWorkingState* pState +) +{ + // Implements the "ladd-1987-m-3" differential-addition-and-doubling formulas + // Source: 1987 Montgomery "Speeding the Pollard and elliptic curve methods of factorization", page 261, + // fifth and sixth displays, plus common-subexpression elimination.
+ // + // Notation from the explicit formulas database: + // (X2,Z2) corresponds to (xp,zp), + // (X3,Z3) corresponds to (xq,zq) + // Result (X4,Z4) (X5,Z5) expected in (xp,zp) and (xq,zq) + // + // A = X2+Z2; AA = A^2; B = X2-Z2; BB = B^2; E = AA-BB; C = X3+Z3; D = X3-Z3; + // DA = D*A; CB = C*B; t0 = DA+CB; t1 = t0^2; X5 = Z1*t1; t2 = DA-CB; + // t3 = t2^2; Z5 = X1*t3; X4 = AA*BB; t4 = a24*E; t5 = BB+t4; Z4 = E*t5 ; + // + // Re-ordered in order to use fewer temporaries. + + fe25519 t1, t2; + + #ifdef DH_SWAP_BY_POINTERS + fe25519 *b1=pState->pXp; fe25519 *b2=pState->pZp; + fe25519 *b3=pState->pXq; fe25519 *b4=pState->pZq; + #else + fe25519 *b1=&pState->xp; fe25519 *b2=&pState->zp; + fe25519 *b3=&pState->xq; fe25519 *b4=&pState->zq; + #endif + + fe25519 *b5= &t1; fe25519 *b6=&t2; + + fe25519_add(b5,b1,b2); // A = X2+Z2 + fe25519_sub(b6,b1,b2); // B = X2-Z2 + fe25519_add(b1,b3,b4); // C = X3+Z3 + fe25519_sub(b2,b3,b4); // D = X3-Z3 + fe25519_mul(b3,b2,b5); // DA= D*A + fe25519_mul(b2,b1,b6); // CB= C*B + fe25519_add(b1,b2,b3); // T0= DA+CB + fe25519_sub(b4,b3,b2); // T2= DA-CB + fe25519_square(b3,b1); // X5==T1= T0^2 (no multiplication by Z1 is performed here) + fe25519_square(b1,b4); // T3= t2^2 + fe25519_mul(b4,b1,&pState->x0); // Z5=X1*t3 + fe25519_square(b1,b5); // AA=A^2 + fe25519_square(b5,b6); // BB=B^2 + fe25519_sub(b2,b1,b5); // E=AA-BB + fe25519_mul(b1,b5,b1); // X4= AA*BB + fe25519_mpyWith121666 (b6,b2); // T4 = a24*E + fe25519_add(b6,b6,b5); // T5 = BB + t4 + fe25519_mul(b2,b6,b2); // Z4 = E*t5 +} + +// Conditionally swaps the working points P and Q of the state (b = 1: swap, b = 0: keep), +// either by swapping the pointers or by swapping the data, depending on DH_SWAP_BY_POINTERS. +static void +curve25519_cswap( + ST_curve25519ladderstepWorkingState* state, + uint8 b +) +{ + #ifdef DH_SWAP_BY_POINTERS + swapPointersConditionally ((void **) &state->pXp,(void **) &state->pXq,b); + swapPointersConditionally ((void **) &state->pZp,(void **) &state->pZq,b); + #else + fe25519_cswap (&state->xp, &state->xq,b); + fe25519_cswap (&state->zp, &state->zq,b); + #endif +} + +#if DH_REPLACE_LAST_THREE_LADDERSTEPS_WITH_DOUBLINGS + +static void +curve25519_doublePointP
(ST_curve25519ladderstepWorkingState* pState) +{ + // Implement the doubling formula "dbl-1987-m-3" + // from 1987 Montgomery "Speeding the Pollard and elliptic curve methods of factorization", + // page 261, sixth display, plus common-subexpression elimination. + // + // Three operand code: + // A = X1+Z1 + // AA = A^2 + // B = X1-Z1 + // BB = B^2 + // C = AA-BB + // X3 = AA*BB + // t0 = a24*C + // t1 = BB+t0 + // Z3 = C*t1 + + // Double the point input in the state variable "P". Use the State variable "Q" as temporary + // for storing A, AA and B, BB. Use the same temporary variable for A and AA respectively and + // B, BB respectively. Note that the previous content of "Q" is overwritten by this. + #ifdef DH_SWAP_BY_POINTERS + fe25519 *pA = pState->pXq; + fe25519 *pB = pState->pZq; + fe25519 *pX = pState->pXp; + fe25519 *pZ = pState->pZp; + #else + fe25519 *pA = &pState->xq; + fe25519 *pB = &pState->zq; + fe25519 *pX = &pState->xp; + fe25519 *pZ = &pState->zp; + #endif + + // A = X1+Z1 + fe25519_add(pA, pX, pZ); + // AA = A^2 + fe25519_square (pA,pA); + // B = X1-Z1 + fe25519_sub(pB, pX, pZ); + // BB = B^2 + fe25519_square (pB,pB); + // X3 = AA*BB + fe25519_mul (pX,pA,pB); + // C = AA-BB + fe25519_sub (pZ,pA,pB); + // t0 = a24*C + fe25519_mpyWith121666 (pA,pZ); + // t1 = BB+t0 + fe25519_add (pB,pA,pB); + // Z3 = C*t1 + fe25519_mul (pZ,pZ,pB); +} + +#endif // #if DH_REPLACE_LAST_THREE_LADDERSTEPS_WITH_DOUBLINGS + +// Curve25519 scalar multiplication. +// r: receives the 32 packed bytes of the x coordinate of the result. +// s: 32 byte scalar. Only a working copy is modified: bit 255 is cleared, bit 254 is set and +// the lowest three bits are either cleared or not processed by the ladder. +// p: 32 bytes holding the x coordinate of the input point. +// Always returns 0. +int +crypto_scalarmult_curve25519( + unsigned char* r, + const unsigned char* s, + const unsigned char* p +) +{ + ST_curve25519ladderstepWorkingState state; + unsigned char i; + + + // Prepare the scalar within the working state buffer. + for (i = 0; i < 32; i++) + { + state.s.as_uint8 [i] = s[i]; + } +#if DH_REPLACE_LAST_THREE_LADDERSTEPS_WITH_DOUBLINGS + // Due to explicit final doubling for the last three bits instead of a full ladderstep, + // the following line is no longer necessary.
+#else + state.s.as_uint8 [0] &= 248; +#endif + state.s.as_uint8 [31] &= 127; + state.s.as_uint8 [31] |= 64; + + // Copy the affine x-coordinate of the base point to the state. + fe25519_unpack (&state.x0, p); + + // Prepare the working points within the working state struct: + // q = (x0 : 1), p = (1 : 0). + + fe25519_setone (&state.zq); + fe25519_cpy (&state.xq, &state.x0); + + fe25519_setone(&state.xp); + fe25519_setzero(&state.zp); + + state.nextScalarBitToProcess = 254; + +#ifdef DH_SWAP_BY_POINTERS + // we need to initially assign the pointers correctly. + state.pXp = &state.xp; + state.pZp = &state.zp; + state.pXq = &state.xq; + state.pZq = &state.zq; +#endif + + state.previousProcessedBit = 0; + +#if DH_REPLACE_LAST_THREE_LADDERSTEPS_WITH_DOUBLINGS + // Process all the bits except for the last three where we explicitly double the result. + while (state.nextScalarBitToProcess >= 3) +#else + // Process all the bits, down to and including bit 0, with full ladder steps. + while (state.nextScalarBitToProcess >= 0) +#endif + { + uint8 byteNo = state.nextScalarBitToProcess >> 3; + uint8 bitNo = state.nextScalarBitToProcess & 7; + uint8 bit; + uint8 swap; + + bit = 1 & (state.s.as_uint8 [byteNo] >> bitNo); + swap = bit ^ state.previousProcessedBit; + state.previousProcessedBit = bit; + curve25519_cswap(&state, swap); + curve25519_ladderstep(&state); + state.nextScalarBitToProcess --; + } + + // Final conditional swap, controlled by the bit that was processed last. + curve25519_cswap(&state,state.previousProcessedBit); + +#if DH_REPLACE_LAST_THREE_LADDERSTEPS_WITH_DOUBLINGS + curve25519_doublePointP (&state); + curve25519_doublePointP (&state); + curve25519_doublePointP (&state); +#endif + +#ifdef DH_SWAP_BY_POINTERS + // optimize for stack usage: the q point and x0 are reused as scratch buffers for the inversion. + fe25519_invert_useProvidedScratchBuffers (state.pZp, state.pZp, state.pXq,state.pZq,&state.x0); + fe25519_mul(state.pXp, state.pXp, state.pZp); + fe25519_reduceCompletely(state.pXp); + + fe25519_pack (r, state.pXp); +#else + // optimize for stack usage: xq, zq and x0 are reused as scratch buffers for the inversion.
+ fe25519_invert_useProvidedScratchBuffers (&state.zp, &state.zp, &state.xq, &state.zq, &state.x0); + fe25519_mul(&state.xp, &state.xp, &state.zp); + fe25519_reduceCompletely(&state.xp); + + fe25519_pack (r, &state.xp); +#endif + + return 0; +} +/* Scalar multiplication with the fixed base point whose packed x coordinate is 9: q receives the result, n is the 32 byte scalar. Returns the value returned by crypto_scalarmult_curve25519(). */ +int +crypto_scalarmult_curve25519_base( + unsigned char* q, + const unsigned char* n +) +{ + static const uint8 base[32] = + { + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + + return crypto_scalarmult_curve25519(q, n, base); +} diff --git a/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/sqr.s b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/sqr.s new file mode 100644 index 00000000..3b190c92 --- /dev/null +++ b/web/server/h2o/libh2o/deps/picotls/deps/cifra/src/arm/unacl/sqr.s @@ -0,0 +1,777 @@ + .align 2 + .global square256_asm + .type square256_asm, %function +square256_asm: + push {r4-r7,lr} + mov r2, r8 + mov r3, r9 + mov r4, r10 + mov r5, r11 + push {r0-r5} + + mov r12, r0 + mov r4, r1 + ldm r4!, {r0-r3} + push {r4} + /////////BEGIN LOW PART ////////////////////// + ///SQR 128, in r0-r3 + mov r8, r2 + mov r9, r3 + eor r4, r4 + sub r2, r0 + sbc r3, r1 + sbc r4, r4 + eor r2, r4 + eor r3, r4 + sub r2, r4 + sbc r3, r4 + mov r10, r2 + mov r11, r3 + //SQR64, in: r0, r1, out: r0-r3, used: r0-r6 + mov r2, r0 + eor r3, r3 + sub r2, r1 + sbc r3, r3 + eor r2, r3 + sub r2, r3 + lsr r3, r0, #16 + uxth r0, r0 + mov r4, r0 + mul r4, r3 + mul r0, r0 + mul r3, r3 + lsr r5, r4, #16 + lsl r4, #16 + add r0, r4 + adc r3, r5 + add r0, r4 + adc r3, r5 + lsr r4, r1, #16 + uxth r1, r1 + mov r5, r1 + mul r5, r4 + mul r1, r1 + mul r4, r4 + eor r6, r6 + add r1, r3 + adc r4, r6 + lsr r3, r5, #16 + lsl r5, r5, #16 + add r1, r5 + adc r4, r3 + add r1, r5 + adc r3, r4 + lsr r4, r2, #16 + uxth r2, r2 + mov r5, r2 + mul r5, r4 + mul r2, r2 + mul r4, r4 + lsr r6, r5, #16 + lsl r5, #16 + add r2, r5 + adc r4, r6 + add r5, r2 + adc r6, r4 + eor r7, r7 + mov r2, r1 + sub
r1, r5 + sbc r2, r6 + sbc r7, r7 + add r1, r0 + adc r2, r3 + adc r7, r3 + mov r3, r12 + stm r3!, {r0-r1} + push {r3} + + mov r12, r0 + mov r0, r8 + mov r8, r1 + mov r1, r9 + mov r9, r2 + //SQR64, in: r0, r1, out: r0-r3, used: r0-r6 + mov r2, r0 + eor r3, r3 + sub r2, r1 + sbc r3, r3 + eor r2, r3 + sub r2, r3 + lsr r3, r0, #16 + uxth r0, r0 + mov r4, r0 + mul r4, r3 + mul r0, r0 + mul r3, r3 + lsr r5, r4, #16 + lsl r4, #16 + add r0, r4 + adc r3, r5 + add r0, r4 + adc r3, r5 + lsr r4, r1, #16 + uxth r1, r1 + mov r5, r1 + mul r5, r4 + mul r1, r1 + mul r4, r4 + eor r6, r6 + add r1, r3 + adc r4, r6 + lsr r3, r5, #16 + lsl r5, r5, #16 + add r1, r5 + adc r4, r3 + add r1, r5 + adc r3, r4 + lsr r4, r2, #16 + uxth r2, r2 + mov r5, r2 + mul r5, r4 + mul r2, r2 + mul r4, r4 + lsr r6, r5, #16 + lsl r5, #16 + add r2, r5 + adc r4, r6 + add r5, r2 + adc r6, r4 + eor r4, r4 + mov r2, r1 + sub r1, r5 + sbc r2, r6 + sbc r4, r4 + add r1, r0 + adc r2, r3 + adc r3, r4 + eor r4, r4 + mov r6, r9 + add r0, r6 + adc r7, r1 + adc r2, r4 + adc r3, r4 + mov r1, r11 + mov r11, r0 + mov r0, r10 + mov r9, r2 + mov r10,r3 + //SQR64, in: r0, r1, out: r0-r3, used: r0-r6 + mov r2, r0 + eor r3, r3 + sub r2, r1 + sbc r3, r3 + eor r2, r3 + sub r2, r3 + lsr r3, r0, #16 + uxth r0, r0 + mov r4, r0 + mul r4, r3 + mul r0, r0 + mul r3, r3 + lsr r5, r4, #16 + lsl r4, #16 + add r0, r4 + adc r3, r5 + add r0, r4 + adc r3, r5 + lsr r4, r1, #16 + uxth r1, r1 + mov r5, r1 + mul r5, r4 + mul r1, r1 + mul r4, r4 + eor r6, r6 + add r1, r3 + adc r4, r6 + lsr r3, r5, #16 + lsl r5, r5, #16 + add r1, r5 + adc r4, r3 + add r1, r5 + adc r3, r4 + lsr r4, r2, #16 + uxth r2, r2 + mov r5, r2 + mul r5, r4 + mul r2, r2 + mul r4, r4 + lsr r6, r5, #16 + lsl r5, #16 + add r2, r5 + adc r4, r6 + add r5, r2 + adc r6, r4 + eor r4, r4 + mov r2, r1 + sub r1, r5 + sbc r2, r6 + sbc r4, r4 + add r1, r0 + adc r2, r3 + adc r3, r4 + mov r6, r11 + mov r4, r11 + mov r5, r7 + sub r6, r0 + sbc r7, r1 + sbc r4, r2 + sbc r5, r3 + eor r1, r1 + sbc r1, 
r1 + mov r2, r12 + mov r3, r8 + add r2, r6 + adc r3, r7 + mov r6, r9 + mov r7, r10 + adc r4, r6 + adc r5, r7 + adc r6, r1 + adc r7, r1 + //results r12, r8, r2-r7 + /////////END LOW PART //////////////////////// + pop {r0,r1} + stm r0!, {r2, r3} + push {r0, r4-r7} + ldm r1, {r0-r3} + /////////BEGIN HIGH PART ////////////////////// + ///SQR 128, in r0-r3 + mov r8, r2 + mov r9, r3 + eor r4, r4 + sub r2, r0 + sbc r3, r1 + sbc r4, r4 + eor r2, r4 + eor r3, r4 + sub r2, r4 + sbc r3, r4 + mov r10, r2 + mov r11, r3 + //SQR64, in: r0, r1, out: r0-r3, used: r0-r6 + mov r2, r0 + eor r3, r3 + sub r2, r1 + sbc r3, r3 + eor r2, r3 + sub r2, r3 + lsr r3, r0, #16 + uxth r0, r0 + mov r4, r0 + mul r4, r3 + mul r0, r0 + mul r3, r3 + lsr r5, r4, #16 + lsl r4, #16 + add r0, r4 + adc r3, r5 + add r0, r4 + adc r3, r5 + lsr r4, r1, #16 + uxth r1, r1 + mov r5, r1 + mul r5, r4 + mul r1, r1 + mul r4, r4 + eor r6, r6 + add r1, r3 + adc r4, r6 + lsr r3, r5, #16 + lsl r5, r5, #16 + add r1, r5 + adc r4, r3 + add r1, r5 + adc r3, r4 + lsr r4, r2, #16 + uxth r2, r2 + mov r5, r2 + mul r5, r4 + mul r2, r2 + mul r4, r4 + lsr r6, r5, #16 + lsl r5, #16 + add r2, r5 + adc r4, r6 + add r5, r2 + adc r6, r4 + eor r7, r7 + mov r2, r1 + sub r1, r5 + sbc r2, r6 + sbc r7, r7 + add r1, r0 + adc r2, r3 + adc r7, r3 + mov r12, r0 + mov r0, r8 + mov r8, r1 + mov r1, r9 + mov r9, r2 + //SQR64, in: r0, r1, out: r0-r3, used: r0-r6 + mov r2, r0 + eor r3, r3 + sub r2, r1 + sbc r3, r3 + eor r2, r3 + sub r2, r3 + lsr r3, r0, #16 + uxth r0, r0 + mov r4, r0 + mul r4, r3 + mul r0, r0 + mul r3, r3 + lsr r5, r4, #16 + lsl r4, #16 + add r0, r4 + adc r3, r5 + add r0, r4 + adc r3, r5 + lsr r4, r1, #16 + uxth r1, r1 + mov r5, r1 + mul r5, r4 + mul r1, r1 + mul r4, r4 + eor r6, r6 + add r1, r3 + adc r4, r6 + lsr r3, r5, #16 + lsl r5, r5, #16 + add r1, r5 + adc r4, r3 + add r1, r5 + adc r3, r4 + lsr r4, r2, #16 + uxth r2, r2 + mov r5, r2 + mul r5, r4 + mul r2, r2 + mul r4, r4 + lsr r6, r5, #16 + lsl r5, #16 + add r2, r5 + adc r4, 
r6 + add r5, r2 + adc r6, r4 + eor r4, r4 + mov r2, r1 + sub r1, r5 + sbc r2, r6 + sbc r4, r4 + add r1, r0 + adc r2, r3 + adc r3, r4 + eor r4, r4 + mov r6, r9 + add r0, r6 + adc r7, r1 + adc r2, r4 + adc r3, r4 + mov r1, r11 + mov r11, r0 + mov r0, r10 + mov r9, r2 + mov r10,r3 + //SQR64, in: r0, r1, out: r0-r3, used: r0-r6 + mov r2, r0 + eor r3, r3 + sub r2, r1 + sbc r3, r3 + eor r2, r3 + sub r2, r3 + lsr r3, r0, #16 + uxth r0, r0 + mov r4, r0 + mul r4, r3 + mul r0, r0 + mul r3, r3 + lsr r5, r4, #16 + lsl r4, #16 + add r0, r4 + adc r3, r5 + add r0, r4 + adc r3, r5 + lsr r4, r1, #16 + uxth r1, r1 + mov r5, r1 + mul r5, r4 + mul r1, r1 + mul r4, r4 + eor r6, r6 + add r1, r3 + adc r4, r6 + lsr r3, r5, #16 + lsl r5, r5, #16 + add r1, r5 + adc r4, r3 + add r1, r5 + adc r3, r4 + lsr r4, r2, #16 + uxth r2, r2 + mov r5, r2 + mul r5, r4 + mul r2, r2 + mul r4, r4 + lsr r6, r5, #16 + lsl r5, #16 + add r2, r5 + adc r4, r6 + add r5, r2 + adc r6, r4 + eor r4, r4 + mov r2, r1 + sub r1, r5 + sbc r2, r6 + sbc r4, r4 + add r1, r0 + adc r2, r3 + adc r3, r4 + mov r6, r11 + mov r4, r11 + mov r5, r7 + sub r6, r0 + sbc r7, r1 + sbc r4, r2 + sbc r5, r3 + eor r1, r1 + sbc r1, r1 + mov r2, r12 + mov r3, r8 + add r2, r6 + adc r3, r7 + mov r6, r9 + mov r7, r10 + adc r4, r6 + adc r5, r7 + adc r6, r1 + adc r7, r1 + //results r12, r8, r2-r7 + /////////END HIGH PART //////////////////////// + mov r0, r12 + mov r1, r8 + mov r8, r4 + mov r9, r5 + mov r10, r6 + mov r11, r7 + pop {r4} + mov r12, r4//str + pop {r4-r7} + add r0, r4 + adc r1, r5 + adc r2, r6 + adc r3, r7 + mov r4, r12 + stm r4!, {r0-r3}//low part + mov r4, r8 + mov r5, r9 + mov r6, r10 + mov r7, r11 + eor r0, r0 + adc r4, r0 + adc r5, r0 + adc r6, r0 + adc r7, r0 + pop {r0, r1} //r0->out, r1, in + push {r0,r4-r7} + ldm r1, {r0-r7} + sub r0, r4 + sbc r1, r5 + sbc r2, r6 + sbc r3, r7 + sbc r4, r4 + eor r0, r4 + eor r1, r4 + eor r2, r4 + eor r3, r4 + sub r0, r4 + sbc r1, r4 + sbc r2, r4 + sbc r3, r4 + //////////BEGIN MIDDLE 
PART//////////////// + ///SQR 128, in r0-r3 + mov r8, r2 + mov r9, r3 + eor r4, r4 + sub r2, r0 + sbc r3, r1 + sbc r4, r4 + eor r2, r4 + eor r3, r4 + sub r2, r4 + sbc r3, r4 + mov r10, r2 + mov r11, r3 + //SQR64, in: r0, r1, out: r0-r3, used: r0-r6 + mov r2, r0 + eor r3, r3 + sub r2, r1 + sbc r3, r3 + eor r2, r3 + sub r2, r3 + lsr r3, r0, #16 + uxth r0, r0 + mov r4, r0 + mul r4, r3 + mul r0, r0 + mul r3, r3 + lsr r5, r4, #16 + lsl r4, #16 + add r0, r4 + adc r3, r5 + add r0, r4 + adc r3, r5 + lsr r4, r1, #16 + uxth r1, r1 + mov r5, r1 + mul r5, r4 + mul r1, r1 + mul r4, r4 + eor r6, r6 + add r1, r3 + adc r4, r6 + lsr r3, r5, #16 + lsl r5, r5, #16 + add r1, r5 + adc r4, r3 + add r1, r5 + adc r3, r4 + lsr r4, r2, #16 + uxth r2, r2 + mov r5, r2 + mul r5, r4 + mul r2, r2 + mul r4, r4 + lsr r6, r5, #16 + lsl r5, #16 + add r2, r5 + adc r4, r6 + add r5, r2 + adc r6, r4 + eor r7, r7 + mov r2, r1 + sub r1, r5 + sbc r2, r6 + sbc r7, r7 + add r1, r0 + adc r2, r3 + adc r7, r3 + mov r12, r0 + mov r0, r8 + mov r8, r1 + mov r1, r9 + mov r9, r2 + //SQR64, in: r0, r1, out: r0-r3, used: r0-r6 + mov r2, r0 + eor r3, r3 + sub r2, r1 + sbc r3, r3 + eor r2, r3 + sub r2, r3 + lsr r3, r0, #16 + uxth r0, r0 + mov r4, r0 + mul r4, r3 + mul r0, r0 + mul r3, r3 + lsr r5, r4, #16 + lsl r4, #16 + add r0, r4 + adc r3, r5 + add r0, r4 + adc r3, r5 + lsr r4, r1, #16 + uxth r1, r1 + mov r5, r1 + mul r5, r4 + mul r1, r1 + mul r4, r4 + eor r6, r6 + add r1, r3 + adc r4, r6 + lsr r3, r5, #16 + lsl r5, r5, #16 + add r1, r5 + adc r4, r3 + add r1, r5 + adc r3, r4 + lsr r4, r2, #16 + uxth r2, r2 + mov r5, r2 + mul r5, r4 + mul r2, r2 + mul r4, r4 + lsr r6, r5, #16 + lsl r5, #16 + add r2, r5 + adc r4, r6 + add r5, r2 + adc r6, r4 + eor r4, r4 + mov r2, r1 + sub r1, r5 + sbc r2, r6 + sbc r4, r4 + add r1, r0 + adc r2, r3 + adc r3, r4 + eor r4, r4 + mov r6, r9 + add r0, r6 + adc r7, r1 + adc r2, r4 + adc r3, r4 + mov r1, r11 + mov r11, r0 + mov r0, r10 + mov r9, r2 + mov r10,r3 + //SQR64, in: r0, r1, out: 
r0-r3, used: r0-r6 + mov r2, r0 + eor r3, r3 + sub r2, r1 + sbc r3, r3 + eor r2, r3 + sub r2, r3 + lsr r3, r0, #16 + uxth r0, r0 + mov r4, r0 + mul r4, r3 + mul r0, r0 + mul r3, r3 + lsr r5, r4, #16 + lsl r4, #16 + add r0, r4 + adc r3, r5 + add r0, r4 + adc r3, r5 + lsr r4, r1, #16 + uxth r1, r1 + mov r5, r1 + mul r5, r4 + mul r1, r1 + mul r4, r4 + eor r6, r6 + add r1, r3 + adc r4, r6 + lsr r3, r5, #16 + lsl r5, r5, #16 + add r1, r5 + adc r4, r3 + add r1, r5 + adc r3, r4 + lsr r4, r2, #16 + uxth r2, r2 + mov r5, r2 + mul r5, r4 + mul r2, r2 + mul r4, r4 + lsr r6, r5, #16 + lsl r5, #16 + add r2, r5 + adc r4, r6 + add r5, r2 + adc r6, r4 + eor r4, r4 + mov r2, r1 + sub r1, r5 + sbc r2, r6 + sbc r4, r4 + add r1, r0 + adc r2, r3 + adc r3, r4 + mov r6, r11 + mov r4, r11 + mov r5, r7 + sub r6, r0 + sbc r7, r1 + sbc r4, r2 + sbc r5, r3 + eor r1, r1 + sbc r1, r1 + mov r2, r12 + mov r3, r8 + add r2, r6 + adc r3, r7 + mov r6, r9 + mov r7, r10 + adc r4, r6 + adc r5, r7 + adc r6, r1 + adc r7, r1 + //results r12, r8, r2-r7 + //////////END MIDDLE PART////////////////// + mvn r2, r2 + mvn r3, r3 + mvn r4, r4 + mvn r5, r5 + mvn r6, r6 + mvn r7, r7 + pop {r1} + push {r4-r7} + mov r4, #1 + asr r4, #1 + ldm r1!, {r4-r7} + mov r0, r12 + mov r12, r1 ////////ref + mov r1, r8 + mvn r0, r0 + mvn r1, r1 + adc r0, r4 + adc r1, r5 + adc r2, r6 + adc r3, r7 + eor r4, r4 + adc r4, r4 + mov r8, r4 //carry A --ini + mov r4, r12 + ldm r4, {r4-r7} + add r0, r4 + adc r1, r5 + adc r2, r6 + adc r3, r7 + mov r9, r4 + mov r4, r12 + stm r4!, {r0-r3} + mov r12, r4 + mov r4, r9 + pop {r0-r3} + adc r4, r0 + adc r5, r1 + adc r6, r2 + adc r7, r3 + eor r0, r0 + adc r0, r0 + mov r9, r0 //carry B --ini + mov r0, r8 + asr r0, #1 //carry A --end + pop {r0-r3} + adc r4, r0 + adc r5, r1 + adc r6, r2 + adc r7, r3 + mov r8, r0 + mov r0, r12 + stm r0!, {r4-r7} + mov r11, r0 + mov r0, r8 + eor r4, r4 + mov r5, r9 + adc r5, r4 //carry B --end + mvn r6, r4 + add r5, r6 + adc r6, r4 + add r0, r5 + adc r1, r6 + adc r2, r6 
+ adc r3, r6 + mov r7, r11 + stm r7!, {r0-r3} + + pop {r3-r6} + mov r8, r3 + mov r9, r4 + mov r10, r5 + mov r11, r6 + pop {r4-r7,pc} + bx lr + .size square256_asm, .-square256_asm |