> have a good reference on why comparison operators are problematic? ie when
> there's no actual branching based on it?
As branch predictors improve, compilers tend to be updated to reintroduce branches, see:

* What you get is what you C: Controlling side effects in mainstream C compilers
* <https://www.cl.cam.ac.uk/~rja14/Papers/whatyouc.pdf>
* <https://www.cl.cam.ac.uk/~rja14/Papers/whatyouc-slides.pdf>

```c
#include <stdint.h>
#include <stdbool.h>

int ct_is_nonzero_u32(uint32_t x) {
    return (x | -x) >> 31;
}

uint32_t ct_mask_u32(uint32_t bit) {
    return -(uint32_t) ct_is_nonzero_u32(bit);
}

uint32_t ct_select_u32(uint32_t x, uint32_t y, bool bit /* = {0,1} */) {
    // VERSION 1
    uint32_t m = ct_mask_u32(bit);
    return (x & m) | (y & ~m);
}

uint32_t ct_select_u32_v2(uint32_t x, uint32_t y, bool bit /* = {0,1} */) {
    // VERSION 2. Same as VERSION 1 but without using separate helper functions
    uint32_t b = (uint32_t) bit;
    uint32_t m = -(uint32_t)((b | -b) >> 31);
    return (x & m) | (y & ~m);
}

uint32_t ct_select_u32_v3(uint32_t x, uint32_t y, bool bit /* = {0,1} */) {
    // VERSION 3
    signed b = 1 - bit;
    return (x * bit) | (y * b);
}

uint32_t ct_select_u32_v4(uint32_t x, uint32_t y, bool bit /* = {0,1} */) {
    // VERSION 4
    signed b = 0 - bit;
    return (x & b) | (y & ~b);
}
```

With Clang 3.0, all of these versions were constant-time; with Clang 3.9, only the slowest one, version 1 with its separate helper functions, remained constant-time, and branches were introduced in the rest.

See also <https://research.kudelskisecurity.com/2017/01/16/when-constant-time-source-may-not-save-you/>, where a seemingly constant-time source was defeated by the Microsoft compiler using an "optimized" 64-bit multiplication, which led to leaking a private key.

In short, and sadly, for cryptography you need assembly not only for performance but also for security.

> my assumption here would be that the built-ins don't have any comparison
> problems since they fetch the carry bit from a cpu flag - ie any time you're
> doing "crypto stuff", you kind of make assumptions about the compiler you're
> working with given the lack of standardization around constant-time-ness.

This is also covered by the previous answer. One thing to note is that, surprisingly, compilers have no notion of a carry flag or overflow flag in their IR; that lives one level lower, for LLVM in the MIR (Machine IR). Proper generation of add-with-carry therefore depends on pattern matching, i.e. the "missed optimization" reports from @chfast. Unfortunately, compilers optimize for speed, and there is no flag for "this is secret data, don't branch on it".
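To make the pattern-matching point concrete, here is a builtin-free add-with-carry written the way backends are usually expected to recognize it. This is only an illustrative sketch: the names `addcarry_u64_portable` and `add256_portable` are mine, and whether this actually lowers to `add`/`adc` is precisely the pattern-matching lottery described above.

```c
#include <stdint.h>

// Sketch of a builtin-free add-with-carry. Whether this lowers to adc
// depends entirely on the backend's pattern matching.
static inline uint64_t addcarry_u64_portable(uint64_t a, uint64_t b,
                                             uint64_t carry_in,   // 0 or 1
                                             uint64_t *carry_out) {
    uint64_t sum = a + b;
    uint64_t c1  = sum < a;        // carry out of a + b
    sum += carry_in;
    uint64_t c2  = sum < carry_in; // carry out of adding carry_in
    *carry_out = c1 | c2;          // at most one of c1, c2 can be set
    return sum;
}

// Same 256-bit addition as the godbolt example below, without intrinsics.
void add256_portable(uint64_t a[4], const uint64_t b[4]) {
    uint64_t carry = 0;
    for (int i = 0; i < 4; ++i)
        a[i] = addcarry_u64_portable(a[i], b[i], carry, &carry);
}
```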
Two bad codegen cases:

1. Compiler differences: <https://gcc.godbolt.org/z/2h768y>

```c
#include <stdint.h>
#include <x86intrin.h>

void add256(uint64_t a[4], uint64_t b[4]) {
    uint8_t carry = 0;
    for (int i = 0; i < 4; ++i)
        carry = _addcarry_u64(carry, a[i], b[i], &a[i]);
}
```

GCC 9 asm:

```asm
add256:
        movq    (%rsi), %rax
        addq    (%rdi), %rax
        setc    %dl
        movq    %rax, (%rdi)
        movq    8(%rdi), %rax
        addb    $-1, %dl
        adcq    8(%rsi), %rax
        setc    %dl
        movq    %rax, 8(%rdi)
        movq    16(%rdi), %rax
        addb    $-1, %dl
        adcq    16(%rsi), %rax
        setc    %dl
        movq    %rax, 16(%rdi)
        movq    24(%rsi), %rax
        addb    $-1, %dl
        adcq    %rax, 24(%rdi)
        ret
```

Clang 9 asm (perfect, and there is a unit test for it):

```asm
add256:
        movq    (%rsi), %rax
        addq    %rax, (%rdi)
        movq    8(%rsi), %rax
        adcq    %rax, 8(%rdi)
        movq    16(%rsi), %rax
        adcq    %rax, 16(%rdi)
        movq    24(%rsi), %rax
        adcq    %rax, 24(%rdi)
        retq
```

2. Same compiler, different backend: annoyingly, while some pattern matching or builtins may work well on x86-64, they don't on other architectures, such as Nvidia GPUs, even when using built-in types like `_ExtInt(128)` that map directly to LLVM's `i128`.

```cuda
// Compile with LLVM:
// /usr/lib/llvm13/bin/clang++ -S -emit-llvm \
//     build/nvidia/wideint128.cu \
//     --cuda-gpu-arch=sm_86 \
//     -L/opt/cuda/lib64 \
//     -lcudart_static -ldl -lrt -pthread
// /usr/lib/llvm13/bin/clang++ build/nvidia/wideint128.cu \
//     -o build/nvidia/wideint128 \
//     --cuda-gpu-arch=sm_86 \
//     -L/opt/cuda/lib64 \
//     -lcudart_static -ldl -lrt -pthread
// llc -mcpu=sm_86 build/nvidia/wideint128-cuda-nvptx64-nvidia-cuda-sm_86.ll -o build/nvidia/wideint128_llvm.ptx
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cstdint>
#include <stdio.h>

typedef _ExtInt(128) u128;

cudaError_t add128();

__global__ void add128Kernel(u128* r, u128 a, u128 b) {
    *r = a + b;
    // for (int i = 0; i < 16; i++) {
    //     printf("%02X", ((unsigned char*)(r))[i]);
    // }
}

int main() {
    cudaError_t cudaStatus = add128();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    getchar();
    return 0;
}

cudaError_t add128() {
    cudaError_t cudaStatus;
    cudaStatus = cudaSetDevice(0);

    u128 r;
    u128 a = 0xFFFFFFFFFFFFFFFFULL;
    u128 b = 0x0010000000000000ULL;

    add128Kernel<<<1, 1>>>(&r, a, b);

    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

Error:
    return cudaStatus;
}
```

LLVM IR (properly uses i128):

```llvm
; ModuleID = 'build/nvidia/wideint128.cu'
source_filename = "build/nvidia/wideint128.cu"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone
define dso_local void @_Z12add128KernelPU7_ExtIntILi128EEiS_S_(i128* %0, i128 %1, i128 %2) #0 {
  %4 = alloca i128*, align 8
  %5 = alloca i128, align 8
  %6 = alloca i128, align 8
  store i128* %0, i128** %4, align 8
  store i128 %1, i128* %5, align 8
  store i128 %2, i128* %6, align 8
  %7 = load i128, i128* %5, align 8
  %8 = load i128, i128* %6, align 8
  %9 = add nsw i128 %7, %8
  %10 = load i128*, i128** %4, align 8
  store i128 %9, i128* %10, align 8
  ret void
}

attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_86" "target-features"="+ptx72,+sm_86" }

!llvm.module.flags = !{!0, !1, !2, !3}
!nvvm.annotations = !{!4}
!llvm.ident = !{!5, !6}

!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 2]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{i32 7, !"frame-pointer", i32 2}
!4 = !{void (i128*, i128, i128)* @_Z12add128KernelPU7_ExtIntILi128EEiS_S_, !"kernel", i32 1}
!5 = !{!"clang version 13.0.1"}
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
```

Nvidia PTX (the carry is reconstructed with setp/selp instead of using the hardware carry flag):

```
//
// Generated by LLVM NVPTX Back-End
//

.version 7.1
.target sm_86
.address_size 64

        // .globl  _Z12add128KernelPU7_ExtIntILi128EEiS_S_
                                        // -- Begin function _Z12add128KernelPU7_ExtIntILi128EEiS_S_
                                        // @_Z12add128KernelPU7_ExtIntILi128EEiS_S_
.visible .entry _Z12add128KernelPU7_ExtIntILi128EEiS_S_(
        .param .u64 _Z12add128KernelPU7_ExtIntILi128EEiS_S__param_0,
        .param .align 16 .b8 _Z12add128KernelPU7_ExtIntILi128EEiS_S__param_1[16],
        .param .align 16 .b8 _Z12add128KernelPU7_ExtIntILi128EEiS_S__param_2[16]
)
{
        .local .align 8 .b8     __local_depot0[40];
        .reg .b64       %SP;
        .reg .b64       %SPL;
        .reg .pred      %p<3>;
        .reg .b64       %rd<18>;

// %bb.0:
        mov.u64         %SPL, __local_depot0;
        cvta.local.u64  %SP, %SPL;
        ld.param.v2.u64 {%rd4, %rd5}, [_Z12add128KernelPU7_ExtIntILi128EEiS_S__param_2];
        ld.param.v2.u64 {%rd2, %rd3}, [_Z12add128KernelPU7_ExtIntILi128EEiS_S__param_1];
        ld.param.u64    %rd1, [_Z12add128KernelPU7_ExtIntILi128EEiS_S__param_0];
        cvta.to.global.u64      %rd6, %rd1;
        cvta.global.u64         %rd7, %rd6;
        st.u64  [%SP+0], %rd7;
        st.u64  [%SP+16], %rd3;
        st.u64  [%SP+8], %rd2;
        st.u64  [%SP+32], %rd5;
        st.u64  [%SP+24], %rd4;
        ld.u64  %rd8, [%SP+16];
        ld.u64  %rd9, [%SP+8];
        ld.u64  %rd10, [%SP+32];
        ld.u64  %rd11, [%SP+24];
        add.s64         %rd12, %rd9, %rd11;
        setp.lt.u64     %p1, %rd12, %rd11;
        setp.lt.u64     %p2, %rd12, %rd9;
        selp.u64        %rd13, 1, 0, %p2;
        selp.b64        %rd14, 1, %rd13, %p1;
        add.s64         %rd15, %rd8, %rd10;
        add.s64         %rd16, %rd15, %rd14;
        ld.u64  %rd17, [%SP+0];
        st.u64  [%rd17], %rd12;
        st.u64  [%rd17+8], %rd16;
        ret;
                                        // -- End function
}
```
But PTX supports:

* `addc` for add with carry-in,
* `add.cc` for add with carry-out,
* `addc.cc` for add with both carry-in and carry-out.

> Given the state of compilers, it's perhaps slightly beyond the reach of the
> primitives here to test for that, although it would indeed be a huge
> advantage if it was verified in unit tests. LLVM for example tests the
> assembly output of a given program in certain cases, it would actually be
> pretty cool to have a test that ensures that for a given platform, a
> "construct" compiles to a certain instruction so that "crypto stuff" can be
> built more reliably.

Yes, I actually spent some time looking into LLVM's tests, for example add256 at <https://github.com/llvm/llvm-project/blob/main/llvm/test/CodeGen/X86/addcarry.ll#L47-L63>, but those are architecture-specific: there is no such guarantee on Nvidia GPUs, so I need my own GPU code generator (via LLVM IR inline assembly) despite not having a constant-time requirement on GPUs :/
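For illustration, here is roughly what forcing those carry instructions looks like with inline PTX at the CUDA C++ level. This is only a sketch: the helper name `add128_limbs` is made up, it assumes the 128-bit value is stored as two little-endian 64-bit limbs, and it is not the LLVM-IR-level code generator mentioned above.

```cuda
#include <cstdint>

// Hypothetical 128-bit add as two 64-bit limbs (lo, hi), spelling out
// add.cc/addc so the hardware carry flag is used instead of setp/selp.
__device__ void add128_limbs(uint64_t r[2], const uint64_t a[2], const uint64_t b[2]) {
    asm("add.cc.u64 %0, %2, %4;\n\t"  // r[0] = a[0] + b[0], sets the carry flag
        "addc.u64   %1, %3, %5;\n\t"  // r[1] = a[1] + b[1] + carry
        : "=l"(r[0]), "=l"(r[1])
        : "l"(a[0]), "l"(a[1]), "l"(b[0]), "l"(b[1]));
}
```

With more limbs, the middle additions would use `addc.cc.u64` so that they both consume and produce a carry.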