https://github.com/python/cpython/commit/61fc72a4a431cbfd42f22e2af76177c73431c3e6
commit: 61fc72a4a431cbfd42f22e2af76177c73431c3e6
branch: main
author: Gregory P. Smith <[email protected]>
committer: gpshead <[email protected]>
date: 2026-01-01T22:03:05-08:00
summary:
gh-124951: Optimize base64 encode & decode for an easy 2-3x speedup [no SIMD] (GH-143262)
Optimize base64 encoding/decoding by eliminating loop-carried dependencies. Key
changes:
- Add `base64_encode_trio()` and `base64_decode_quad()` helper functions that
process complete groups independently
- Add `base64_encode_fast()` and `base64_decode_fast()` wrappers
- Update `b2a_base64` and `a2b_base64` to use fast path for complete groups
Performance gains (encode/decode speedup vs main, PGO builds):
```
          64 bytes    64K         1M
Zen2:     1.2x/1.8x   1.7x/2.8x   1.5x/2.8x
Zen4:     1.2x/1.7x   1.6x/3.0x   1.5x/3.0x  [old data, likely faster]
M4:       1.3x/1.9x   2.3x/2.8x   2.4x/2.9x  [old data, likely faster]
RPi5-32:  1.2x/1.2x   2.4x/2.4x   2.0x/2.1x
```
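For anyone who wants to sanity-check the table, here is a minimal sketch of one
way to measure it (an illustrative `timeit` loop over the same payload sizes,
not the harness used for the numbers above; run it on a build of this branch
and on `main` and compare):
```
# Rough throughput check for binascii base64 encode/decode.
# Payload sizes match the table above: 64 bytes, 64K, 1M.
import binascii
import os
import timeit

for size in (64, 64 * 1024, 1024 * 1024):
    payload = os.urandom(size)
    encoded = binascii.b2a_base64(payload)
    reps = max(1, 10_000_000 // size)   # ~10 MB of work per measurement
    enc_t = timeit.timeit(lambda: binascii.b2a_base64(payload), number=reps)
    dec_t = timeit.timeit(lambda: binascii.a2b_base64(encoded), number=reps)
    print(f"{size:>8} bytes: encode {size * reps / enc_t / 1e6:7.1f} MB/s, "
          f"decode {size * reps / dec_t / 1e6:7.1f} MB/s")
```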
Based on my exploratory work done in
https://github.com/python/cpython/compare/main...gpshead:cpython:claude/vectorize-base64-c-S7Hku
See PR and issue for further thoughts on sometimes MUCH faster SIMD vectorized
versions of this.
files:
A Misc/NEWS.d/next/Library/2025-12-29-00-42-26.gh-issue-124951.OsC5K4.rst
M Doc/whatsnew/3.15.rst
M Modules/binascii.c
diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst
index 11f08031ec54f2..dc4248655b4b2d 100644
--- a/Doc/whatsnew/3.15.rst
+++ b/Doc/whatsnew/3.15.rst
@@ -428,6 +428,13 @@ argparse
inline code when color output is enabled.
(Contributed by Savannah Ostrowski in :gh:`142390`.)
+base64 & binascii
+-----------------
+
+* CPython's underlying base64 implementation now encodes 2x faster and decodes 3x
+  faster thanks to simple CPU pipelining optimizations.
+ (Contributed by Gregory P. Smith & Serhiy Storchaka in :gh:`143262`.)
+
calendar
--------
diff --git a/Misc/NEWS.d/next/Library/2025-12-29-00-42-26.gh-issue-124951.OsC5K4.rst b/Misc/NEWS.d/next/Library/2025-12-29-00-42-26.gh-issue-124951.OsC5K4.rst
new file mode 100644
index 00000000000000..10c7f8632d736b
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-12-29-00-42-26.gh-issue-124951.OsC5K4.rst
@@ -0,0 +1,3 @@
+The base64 implementation behind the :mod:`binascii`, :mod:`base64`, and
+related codec has been optimized for modern pipelined CPU architectures and
+now performs 2-3x faster across all platforms.
diff --git a/Modules/binascii.c b/Modules/binascii.c
index 13e4bc5be03ebd..a0a2960eef5ab0 100644
--- a/Modules/binascii.c
+++ b/Modules/binascii.c
@@ -76,11 +76,12 @@ get_binascii_state(PyObject *module)
}
-static const unsigned char table_a2b_base64[] = {
+/* Align to 64 bytes for L1 cache line friendliness */
+static const unsigned char table_a2b_base64[] Py_ALIGNED(64) = {
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
- 52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1, 0,-1,-1, /* Note PAD->0 */
+    52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,64,-1,-1, /* PAD->64 detected by fast path */
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14,
15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
-1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
@@ -101,9 +102,91 @@ static const unsigned char table_a2b_base64[] = {
/* Max binary chunk size; limited only by available memory */
#define BASE64_MAXBIN ((PY_SSIZE_T_MAX - 3) / 2)
-static const unsigned char table_b2a_base64[] =
+/*
+ * Fast base64 encoding/decoding helpers.
+ *
+ * Process complete groups without loop-carried dependencies.
+ */
+
+/* Align to 64 bytes for L1 cache line friendliness */
+static const unsigned char table_b2a_base64[] Py_ALIGNED(64) =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+/* Encode 3 bytes into 4 base64 characters. */
+static inline void
+base64_encode_trio(const unsigned char *in, unsigned char *out,
+ const unsigned char *table)
+{
+ unsigned int combined = ((unsigned int)in[0] << 16) |
+ ((unsigned int)in[1] << 8) |
+ (unsigned int)in[2];
+ out[0] = table[(combined >> 18) & 0x3f];
+ out[1] = table[(combined >> 12) & 0x3f];
+ out[2] = table[(combined >> 6) & 0x3f];
+ out[3] = table[combined & 0x3f];
+}
+
+/* Encode multiple complete 3-byte groups.
+ * Returns the number of input bytes processed (always a multiple of 3).
+ */
+static inline Py_ssize_t
+base64_encode_fast(const unsigned char *in, Py_ssize_t in_len,
+ unsigned char *out, const unsigned char *table)
+{
+ Py_ssize_t n_trios = in_len / 3;
+ const unsigned char *in_end = in + n_trios * 3;
+
+ while (in < in_end) {
+ base64_encode_trio(in, out, table);
+ in += 3;
+ out += 4;
+ }
+
+ return n_trios * 3;
+}
+
+/* Decode 4 base64 characters into 3 bytes.
+ * Returns 1 on success, 0 if any character is invalid.
+ */
+static inline int
+base64_decode_quad(const unsigned char *in, unsigned char *out,
+ const unsigned char *table)
+{
+ unsigned char v0 = table[in[0]];
+ unsigned char v1 = table[in[1]];
+ unsigned char v2 = table[in[2]];
+ unsigned char v3 = table[in[3]];
+
+ if ((v0 | v1 | v2 | v3) & 0xc0) {
+ return 0;
+ }
+
+ out[0] = (v0 << 2) | (v1 >> 4);
+ out[1] = (v1 << 4) | (v2 >> 2);
+ out[2] = (v2 << 6) | v3;
+ return 1;
+}
+
+/* Decode multiple complete 4-character groups (no padding allowed).
+ * Returns the number of input characters processed.
+ * Stops at the first invalid character, padding, or incomplete group.
+ */
+static inline Py_ssize_t
+base64_decode_fast(const unsigned char *in, Py_ssize_t in_len,
+ unsigned char *out, const unsigned char *table)
+{
+ Py_ssize_t n_quads = in_len / 4;
+ Py_ssize_t i;
+
+ for (i = 0; i < n_quads; i++) {
+ if (!base64_decode_quad(in + i * 4, out + i * 3, table)) {
+ break;
+ }
+ }
+
+ return i * 4;
+}
+
static const unsigned short crctab_hqx[256] = {
0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7,
@@ -403,10 +486,26 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode)
goto error_end;
}
+ size_t i = 0; /* Current position in input */
+
+ /* Fast path: use optimized decoder for complete quads.
+ * This works for both strict and non-strict mode for valid input.
+ * The fast path stops at padding, invalid chars, or incomplete groups.
+ */
+ if (ascii_len >= 4) {
+        Py_ssize_t fast_chars = base64_decode_fast(ascii_data, (Py_ssize_t)ascii_len,
+                                                    bin_data, table_a2b_base64);
+ if (fast_chars > 0) {
+ i = (size_t)fast_chars;
+ bin_data += (fast_chars / 4) * 3;
+ }
+ }
+
+    /* Slow path: handle remaining input (padding, invalid chars, partial groups) */
int quad_pos = 0;
unsigned char leftchar = 0;
int pads = 0;
- for (size_t i = 0; i < ascii_len; i++) {
+ for (; i < ascii_len; i++) {
unsigned char this_ch = ascii_data[i];
/* Check for pad sequences and ignore
@@ -533,9 +632,6 @@ binascii_b2a_base64_impl(PyObject *module, Py_buffer *data, int newline)
/*[clinic end generated code: output=4ad62c8e8485d3b3 input=0e20ff59c5f2e3e1]*/
{
const unsigned char *bin_data;
- int leftbits = 0;
- unsigned char this_ch;
- unsigned int leftchar = 0;
Py_ssize_t bin_len;
binascii_state *state;
@@ -566,26 +662,31 @@ binascii_b2a_base64_impl(PyObject *module, Py_buffer *data, int newline)
}
unsigned char *ascii_data = PyBytesWriter_GetData(writer);
- for( ; bin_len > 0 ; bin_len--, bin_data++ ) {
- /* Shift the data into our buffer */
- leftchar = (leftchar << 8) | *bin_data;
- leftbits += 8;
-
- /* See if there are 6-bit groups ready */
- while ( leftbits >= 6 ) {
- this_ch = (leftchar >> (leftbits-6)) & 0x3f;
- leftbits -= 6;
- *ascii_data++ = table_b2a_base64[this_ch];
- }
- }
- if ( leftbits == 2 ) {
- *ascii_data++ = table_b2a_base64[(leftchar&3) << 4];
+ /* Use the optimized fast path for complete 3-byte groups */
+ Py_ssize_t fast_bytes = base64_encode_fast(bin_data, bin_len, ascii_data,
+ table_b2a_base64);
+ bin_data += fast_bytes;
+ ascii_data += (fast_bytes / 3) * 4;
+ bin_len -= fast_bytes;
+
+ /* Handle remaining 0-2 bytes */
+ if (bin_len == 1) {
+ /* 1 byte remaining: produces 2 base64 chars + 2 padding */
+ unsigned int val = bin_data[0];
+ *ascii_data++ = table_b2a_base64[(val >> 2) & 0x3f];
+ *ascii_data++ = table_b2a_base64[(val << 4) & 0x3f];
*ascii_data++ = BASE64_PAD;
*ascii_data++ = BASE64_PAD;
- } else if ( leftbits == 4 ) {
- *ascii_data++ = table_b2a_base64[(leftchar&0xf) << 2];
+ }
+ else if (bin_len == 2) {
+ /* 2 bytes remaining: produces 3 base64 chars + 1 padding */
+ unsigned int val = ((unsigned int)bin_data[0] << 8) | bin_data[1];
+ *ascii_data++ = table_b2a_base64[(val >> 10) & 0x3f];
+ *ascii_data++ = table_b2a_base64[(val >> 4) & 0x3f];
+ *ascii_data++ = table_b2a_base64[(val << 2) & 0x3f];
*ascii_data++ = BASE64_PAD;
}
+
if (newline)
*ascii_data++ = '\n'; /* Append a courtesy newline */