On Wed, 1 Oct 2025 17:39:32 +0800 Guan-Chun Wu <[email protected]> wrote:
> On Tue, Sep 30, 2025 at 05:11:12PM -0700, Caleb Sander Mateos wrote: > > On Fri, Sep 26, 2025 at 12:01 AM Guan-Chun Wu <[email protected]> > > wrote: > > > > > > The old base64 implementation relied on a bit-accumulator loop, which was > > > slow for larger inputs and too permissive in validation. It would accept > > > extra '=', missing '=', or even '=' appearing in the middle of the input, > > > allowing malformed strings to pass. This patch reworks the internals to > > > improve performance and enforce stricter validation. > > > > > > Changes: > > > - Encoder: > > > * Process input in 3-byte blocks, mapping 24 bits into four 6-bit > > > symbols, avoiding bit-by-bit shifting and reducing loop iterations. > > > * Handle the final 1-2 leftover bytes explicitly and emit '=' only when > > > requested. > > > - Decoder: > > > * Based on the reverse lookup tables from the previous patch, decode > > > input in 4-character groups. > > > * Each group is looked up directly, converted into numeric values, and > > > combined into 3 output bytes. > > > * Explicitly handle padded and unpadded forms: > > > - With padding: input length must be a multiple of 4, and '=' is > > > allowed only in the last two positions. Reject stray or early '='. > > > - Without padding: validate tail lengths (2 or 3 chars) and require > > > unused low bits to be zero. > > > * Removed the bit-accumulator style loop to reduce loop iterations. > > > > > > Performance (x86_64, Intel Core i7-10700 @ 2.90GHz, avg over 1000 runs, > > > KUnit): > > > > > > Encode: > > > 64B ~90ns -> ~32ns (~2.8x) > > > 1KB ~1332ns -> ~510ns (~2.6x) > > > > > > Decode: > > > 64B ~1530ns -> ~64ns (~23.9x) > > > 1KB ~27726ns -> ~982ns (~28.3x) > > > > > > Co-developed-by: Kuan-Wei Chiu <[email protected]> > > > Signed-off-by: Kuan-Wei Chiu <[email protected]> > > > Co-developed-by: Yu-Sheng Huang <[email protected]> > > > Signed-off-by: Yu-Sheng Huang <[email protected]> > > > Signed-off-by: Guan-Chun Wu <[email protected]> > > > --- > > > lib/base64.c | 150 +++++++++++++++++++++++++++++++++++++-------------- > > > 1 file changed, 110 insertions(+), 40 deletions(-) > > > > > > diff --git a/lib/base64.c b/lib/base64.c > > > index b20fdf168..fd1db4611 100644 > > > --- a/lib/base64.c > > > +++ b/lib/base64.c > > > @@ -93,26 +93,43 @@ static const s8 base64_rev_tables[][256] = { > > > int base64_encode(const u8 *src, int srclen, char *dst, bool padding, > > > enum base64_variant variant) > > > { > > > u32 ac = 0; > > > - int bits = 0; > > > - int i; > > > char *cp = dst; > > > const char *base64_table = base64_tables[variant]; > > > > > > - for (i = 0; i < srclen; i++) { > > > - ac = (ac << 8) | src[i]; > > > - bits += 8; > > > - do { > > > - bits -= 6; > > > - *cp++ = base64_table[(ac >> bits) & 0x3f]; > > > - } while (bits >= 6); > > > - } > > > - if (bits) { > > > - *cp++ = base64_table[(ac << (6 - bits)) & 0x3f]; > > > - bits -= 6; > > > + while (srclen >= 3) { > > > + ac = ((u32)src[0] << 16) | > > > + ((u32)src[1] << 8) | > > > + (u32)src[2]; > > > + > > > + *cp++ = base64_table[ac >> 18]; > > > + *cp++ = base64_table[(ac >> 12) & 0x3f]; > > > + *cp++ = base64_table[(ac >> 6) & 0x3f]; > > > + *cp++ = base64_table[ac & 0x3f]; > > > + > > > + src += 3; > > > + srclen -= 3; > > > } > > > - while (bits < 0) { > > > - *cp++ = '='; > > > - bits += 2; > > > + > > > + switch (srclen) { > > > + case 2: > > > + ac = ((u32)src[0] << 16) | > > > + ((u32)src[1] << 8); > > > + > > > + *cp++ = base64_table[ac >> 18]; > > > + *cp++ = base64_table[(ac >> 12) & 0x3f]; > > > + *cp++ = base64_table[(ac >> 6) & 0x3f]; > > > + if (padding) > > > + *cp++ = '='; > > > + break; > > > + case 1: > > > + ac = ((u32)src[0] << 16); > > > + *cp++ = base64_table[ac >> 18]; > > > + *cp++ = base64_table[(ac >> 12) & 0x3f]; > > > + if (padding) { > > > + *cp++ = '='; > > > + *cp++ = '='; > > > + } > > > + break; > > > } > > > return cp - dst; > > > } > > > @@ -128,39 +145,92 @@ EXPORT_SYMBOL_GPL(base64_encode); > > > * > > > * Decodes a string using the selected Base64 variant. > > > * > > > - * This implementation hasn't been optimized for performance. > > > - * > > > * Return: the length of the resulting decoded binary data in bytes, > > > * or -1 if the string isn't a valid Base64 string. > > > */ > > > int base64_decode(const char *src, int srclen, u8 *dst, bool padding, > > > enum base64_variant variant) > > > { > > > - u32 ac = 0; > > > - int bits = 0; > > > - int i; > > > u8 *bp = dst; > > > - s8 ch; > > > - > > > - for (i = 0; i < srclen; i++) { > > > - if (src[i] == '=') { > > > - ac = (ac << 6); > > > - bits += 6; > > > - if (bits >= 8) > > > - bits -= 8; > > > - continue; > > > - } > > > - ch = base64_rev_tables[variant][(u8)src[i]]; > > > - if (ch == -1) > > > + s8 input1, input2, input3, input4; > > > + u32 val; > > > + > > > + if (srclen == 0) > > > + return 0; > > > > Doesn't look like this special case is necessary; all the if and while > > conditions below are false if srclen == 0, so the function will just > > end up returning 0 in that case anyways. It would be nice to avoid > > this branch, especially as it seems like an uncommon case. > > > > You're right. I'll remove it. Thanks. > > > > + > > > + /* Validate the input length for padding */ > > > + if (unlikely(padding && (srclen & 0x03) != 0)) > > > + return -1; > > > + > > > + while (srclen >= 4) { > > > + /* Decode the next 4 characters */ > > > + input1 = base64_rev_tables[variant][(u8)src[0]]; > > > + input2 = base64_rev_tables[variant][(u8)src[1]]; > > > + input3 = base64_rev_tables[variant][(u8)src[2]]; > > > + input4 = base64_rev_tables[variant][(u8)src[3]]; > > > + > > > + /* Return error if any Base64 character is invalid */ > > > + if (unlikely(input1 < 0 || input2 < 0 || (!padding && > > > (input3 < 0 || input4 < 0)))) > > > + return -1; > > > + > > > + /* Handle padding */ > > > + if (unlikely(padding && ((input3 < 0 && input4 >= 0) || > > > + (input3 < 0 && src[2] != '=') || > > > + (input4 < 0 && src[3] != '=') || > > > + (srclen > 4 && (input3 < 0 || > > > input4 < 0))))) > > > > Would be preferable to check and strip the padding (i.e. decrease > > srclen) before this main loop. That way we could avoid several > > branches in this hot loop that are only necessary to handle the > > padding chars. > > > > You're right. As long as we check and strip the padding first, the > behavior with or without padding can be the same, and it could also > reduce some unnecessary branches. I'll make the change. As I said earlier. Calculate 'val' first using signed arithmetic. If it is non-negative there are three bytes to write. If negative then check for src[2] and src[3] being '=' (etc) before erroring out. That way there is only one check in the normal path. David > > Best regards, > Guan-Chun > > > > + return -1; > > > + val = ((u32)input1 << 18) | > > > + ((u32)input2 << 12) | > > > + ((u32)((input3 < 0) ? 0 : input3) << 6) | > > > + (u32)((input4 < 0) ? 0 : input4); > > > + > > > + *bp++ = (u8)(val >> 16); > > > + > > > + if (input3 >= 0) > > > + *bp++ = (u8)(val >> 8); > > > + if (input4 >= 0) > > > + *bp++ = (u8)val; > > > + > > > + src += 4; > > > + srclen -= 4; > > > + } > > > + > > > + /* Handle leftover characters when padding is not used */ > > > + if (!padding && srclen > 0) { > > > + switch (srclen) { > > > + case 2: > > > + input1 = base64_rev_tables[variant][(u8)src[0]]; > > > + input2 = base64_rev_tables[variant][(u8)src[1]]; > > > + if (unlikely(input1 < 0 || input2 < 0)) > > > + return -1; > > > + > > > + val = ((u32)input1 << 6) | (u32)input2; /* 12 > > > bits */ > > > + if (unlikely(val & 0x0F)) > > > + return -1; /* low 4 bits must be zero */ > > > + > > > + *bp++ = (u8)(val >> 4); > > > + break; > > > + case 3: > > > + input1 = base64_rev_tables[variant][(u8)src[0]]; > > > + input2 = base64_rev_tables[variant][(u8)src[1]]; > > > + input3 = base64_rev_tables[variant][(u8)src[2]]; > > > + if (unlikely(input1 < 0 || input2 < 0 || input3 < > > > 0)) > > > + return -1; > > > + > > > + val = ((u32)input1 << 12) | > > > + ((u32)input2 << 6) | > > > + (u32)input3; /* 18 bits */ > > > + > > > + if (unlikely(val & 0x03)) > > > + return -1; /* low 2 bits must be zero */ > > > + > > > + *bp++ = (u8)(val >> 10); > > > + *bp++ = (u8)((val >> 2) & 0xFF); > > > > "& 0xFF" is redundant with the cast to u8. > > > > Best, > > Caleb > > > > > + break; > > > + default: > > > return -1; > > > - ac = (ac << 6) | ch; > > > - bits += 6; > > > - if (bits >= 8) { > > > - bits -= 8; > > > - *bp++ = (u8)(ac >> bits); > > > } > > > } > > > - if (ac & ((1 << bits) - 1)) > > > - return -1; > > > + > > > return bp - dst; > > > } > > > EXPORT_SYMBOL_GPL(base64_decode); > > > -- > > > 2.34.1 > > > > > > >
