A 58 character encoding that: - avoids visually ambiguous 0OIl characters - uses only alphanumeric characters Described at: - https://tools.ietf.org/html/draft-msporny-base58-03
This implementation uses GMP (or gnulib's gmp fallback). Performance is good in comparison to other implementations. For example when using libgmp, encoding is 6 times faster, and decoding 28 times faster than the implementation using arbitrary precision ints in cypthon 3.13. Memory use is proportional to the size of input. Encoding benchmarks: $ time yes | head -c65535 | src/basenc --base58 -w0 >file.enc real 0m1.533s $ ./configure --without-libgmp && make # gnulib gmp $ time yes | head -c65535 | src/basenc --base58 -w0 >file.enc real 0m3.587s # dnf install python3-base58 $ time yes | head -c65535 | base58 >file.enc # cpython 3.13 real 0m9.700s Decoding benchmarks: $ time src/basenc --base58 -d <file.enc >/dev/null real 0m0.299s $ ./configure --without-libgmp && make # gnulib gmp $ time src/basenc --base58 -d <file.enc >/dev/null real 0m1.469s $ time base58 -d <file.enc >/dev/null # cpython 3.13 real 0m8.302s * src/basenc.c (base_decode_ctx_finalize, base_encode_ctx_init, base_encode_ctx, base_encode_ctx_finalize): New functions to provide more general processing functionality. (base58_{de,en}code_ctx{_init,,_finalize}): New functions to accumulate all input before calling ... (base58_{de,en}code): ... the GMP based encoding/decoding routines. (do_encode, do_decode): Call the ctx variants if enabled. * doc/coreutils.texi (basenc invocation): Describe the new option, and indicate the main use case being interactive user use. * src/local.mk: Link basenc with GMP. * tests/basenc/basenc.pl: Add test cases. * NEWS: Mention the new feature. --- NEWS | 5 + doc/coreutils.texi | 9 + src/basenc.c | 361 ++++++++++++++++++++++++++++++++++++++++- src/local.mk | 1 + tests/basenc/basenc.pl | 42 +++++ 5 files changed, 413 insertions(+), 5 deletions(-) diff --git a/NEWS b/NEWS index aa04c1884..fa2919421 100644 --- a/NEWS +++ b/NEWS @@ -68,6 +68,11 @@ GNU coreutils NEWS -*- outline -*- pretended that standard input was not a tty. [This bug was present in "the beginning".] +** New Features + + basenc supports the --base58 option to encode and decode + the visually unambiguous Base58 encoding. + ** Improvements cp, install and mv now avoid possible data corruption on diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 34ea70085..19de577e5 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -2370,6 +2370,15 @@ base64 form (using @samp{_} and @samp{-} instead of @samp{+} and @samp{/}). The format conforms to @uref{https://datatracker.ietf.org/doc/html/rfc4648#section-5, RFC 4648#5}. +@item --base58 +@opindex --base58 +Encode into (or decode from with @option{-d/--decode}) base58 form. +The format conforms to +@uref{https://datatracker.ietf.org/doc/html/draft-msporny-base58-03, +Base58 draft}. +This encoding is useful for transcription as the output avoids +visually similar characters. It's best suited to smaller amounts of data. + @item --base32 @opindex --base32 Encode into (or decode from with @option{-d/--decode}) base32 form. diff --git a/src/basenc.c b/src/basenc.c index 3f550a71f..5556bc82d 100644 --- a/src/basenc.c +++ b/src/basenc.c @@ -21,6 +21,7 @@ #include <stdio.h> #include <getopt.h> #include <sys/types.h> +#include <gmp.h> #include "system.h" #include "assure.h" @@ -61,6 +62,7 @@ enum { BASE64_OPTION = CHAR_MAX + 1, BASE64URL_OPTION, + BASE58_OPTION, BASE32_OPTION, BASE32HEX_OPTION, BASE16_OPTION, @@ -78,6 +80,7 @@ static struct option const long_options[] = #if BASE_TYPE == 42 {"base64", no_argument, 0, BASE64_OPTION}, {"base64url", no_argument, 0, BASE64URL_OPTION}, + {"base58", no_argument, 0, BASE58_OPTION}, {"base32", no_argument, 0, BASE32_OPTION}, {"base32hex", no_argument, 0, BASE32HEX_OPTION}, {"base16", no_argument, 0, BASE16_OPTION}, @@ -119,6 +122,9 @@ Base%d encode or decode FILE, or standard input, to standard output.\n\ "), stdout); fputs (_("\ --base64url file- and url-safe base64 (RFC4648 section 5)\n\ +"), stdout); + fputs (_("\ + --base58 visually unambiguous base58 encoding\n\ "), stdout); fputs (_("\ --base32 same as 'base32' program (RFC4648 section 6)\n\ @@ -263,6 +269,13 @@ struct z85_decode_context unsigned char octets[5]; }; +struct base58_context +{ + unsigned char *buf; + idx_t size; + idx_t capacity; +}; + struct base2_decode_context { unsigned char octet; @@ -277,6 +290,7 @@ struct base_decode_context struct base16_decode_context base16; struct base2_decode_context base2; struct z85_decode_context z85; + struct base58_context base58; } ctx; char *inbuf; idx_t bufsize; @@ -285,6 +299,23 @@ static void (*base_decode_ctx_init) (struct base_decode_context *ctx); static bool (*base_decode_ctx) (struct base_decode_context *ctx, char const *restrict in, idx_t inlen, char *restrict out, idx_t *outlen); +static bool (*base_decode_ctx_finalize) (struct base_decode_context *ctx, + char *restrict *out, idx_t *outlen); + +struct base_encode_context +{ + union { + struct base58_context base58; + } ctx; +}; + +static void (*base_encode_ctx_init) (struct base_encode_context *ctx); +static bool (*base_encode_ctx) (struct base_encode_context *ctx, + char const *restrict in, idx_t inlen, + char *restrict out, idx_t *outlen); +static bool (*base_encode_ctx_finalize) (struct base_encode_context *ctx, + char *restrict *out, idx_t *outlen); + #endif @@ -1036,6 +1067,269 @@ base2msbf_decode_ctx (struct base_decode_context *ctx, return true; } +static char const base58_alphabet[58] _GL_ATTRIBUTE_NONSTRING = + "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"; + +static signed char const base58_to_int[256] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, -1, -1, -1, + -1, 9, 10, 11, 12, 13, 14, 15, 16, -1, 17, 18, 19, 20, 21, -1, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, -1, -1, -1, -1, -1, + -1, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, -1, 44, 45, 46, + 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 +}; + +static bool +isubase58 (unsigned char ch) +{ + return ch < sizeof base58_to_int && 0 <= base58_to_int[ch]; +} + + +static int +base58_length (int len) +{ + /* Base58 output length is approximately log(256)/log(58). */ + return (len * 138) / 100 + 1; +} + + +static void +base58_encode_ctx_init (struct base_encode_context *ctx) +{ + ctx->ctx.base58.buf = nullptr; + ctx->ctx.base58.size = 0; + ctx->ctx.base58.capacity = 0; +} + + +static bool +base58_encode_ctx (struct base_encode_context *ctx, + char const *restrict in, idx_t inlen, + MAYBE_UNUSED char *restrict out, idx_t *outlen) +{ + *outlen = 0; /* Only accumulate input in this function. */ + + if (inlen == 0) + return true; + + idx_t free_space = ctx->ctx.base58.capacity - ctx->ctx.base58.size; + if (free_space < inlen) + { + ctx->ctx.base58.buf = xpalloc (ctx->ctx.base58.buf, + &ctx->ctx.base58.capacity, + inlen - free_space, + -1, sizeof *ctx->ctx.base58.buf); + } + + memcpy (ctx->ctx.base58.buf + ctx->ctx.base58.size, in, inlen); + ctx->ctx.base58.size += inlen; + + return true; +} + +static void +base58_encode (char const* data, size_t data_len, + char *out, idx_t *outlen) +{ + affirm (base_length (data_len) <= *outlen); + + size_t leading_zeros = 0; + while (leading_zeros < data_len && data[leading_zeros] == 0) + leading_zeros++; + + /* Init GMP integer from binary (base 256) data. */ + mpz_t num; + mpz_init (num); + mpz_import (num, data_len, 1, 1, 0, 0, data); + + char *ptr = out + *outlen; /* Start just beyond end. */ + + /* Convert to base 58 by repeatedly dividing by 58. */ + mpz_t quotient, remainder; + mpz_init (quotient); + mpz_init (remainder); + while (mpz_cmp_ui (num, 0) > 0) + { + mpz_fdiv_qr_ui (quotient, remainder, num, 58); + unsigned long rem_val = mpz_get_ui (remainder); + *(--ptr) = base58_alphabet[rem_val]; + mpz_set (num, quotient); + } + + /* Account for leading zeros. */ + ptr -= leading_zeros; + memset (ptr, '1', leading_zeros); + + /* Prepare return. */ + *outlen -= (ptr - out); + memmove (out, ptr, *outlen); + + mpz_clear (num); + mpz_clear (quotient); + mpz_clear (remainder); + + return; +} + + +static bool +base58_encode_ctx_finalize (struct base_encode_context *ctx, + char *restrict *out, idx_t *outlen) +{ + /* Ensure output buffer is large enough. */ + idx_t max_outlen = base_length (ctx->ctx.base58.size); + if (max_outlen > *outlen) + { + *out = xrealloc (*out, max_outlen); + *outlen = max_outlen; + } + + base58_encode ((char *)ctx->ctx.base58.buf, ctx->ctx.base58.size, + *out, outlen); + + free (ctx->ctx.base58.buf); + ctx->ctx.base58.buf = nullptr; + + return true; +} + + +static void +base58_decode_ctx_init (struct base_decode_context *ctx) +{ + ctx->ctx.base58.size = 0; + ctx->ctx.base58.capacity = 0; + ctx->ctx.base58.buf = nullptr; + ctx->i = 0; +} + +static bool +base58_decode_ctx (struct base_decode_context *ctx, + char const *restrict in, idx_t inlen, + MAYBE_UNUSED char *restrict out, idx_t *outlen) +{ + bool ignore_lines = true; /* for now, always ignore them */ + + *outlen = 0; /* Only accumulate input in this function. */ + + if (inlen == 0) + return true; + + idx_t free_space = ctx->ctx.base58.capacity - ctx->ctx.base58.size; + if (free_space < inlen) + { + ctx->ctx.base58.buf = xpalloc (ctx->ctx.base58.buf, + &ctx->ctx.base58.capacity, + inlen - free_space, + -1, sizeof *ctx->ctx.base58.buf); + } + + /* Accumulate all valid input characters in our buffer */ + for (idx_t i = 0; i < inlen; i++) + { + unsigned char c = in[i]; + + if (ignore_lines && c == '\n') + continue; + + if (c == '\0') + continue; + + if (!isubase58 (c)) + return false; + + ctx->ctx.base58.buf[ctx->ctx.base58.size++] = c; + } + + return true; +} + + +static bool +base58_decode (char const *data, size_t data_len, + char *restrict out, idx_t *outlen) +{ + affirm (data_len <= *outlen); + + /* Count leading '1's in the accumulated buffer */ + size_t leading_ones = 0; + while (leading_ones < data_len && data[leading_ones] == '1') + leading_ones++; + + /* Initialize GMP integer */ + mpz_t num; + mpz_init (num); + + /* Convert base58 string to GMP integer. */ + unsigned char const *b58_digits = (unsigned char *)data + leading_ones; + size_t b58_len = data_len - leading_ones; + + for (size_t i = 0; i < b58_len; i++) + { + int val = base58_to_int[b58_digits[i]]; + if (val == -1) + { + mpz_clear (num); + return false; + } + + mpz_mul_ui (num, num, 58); + mpz_add_ui (num, num, val); + } + + /* Add leading zeros for leading '1's */ + memset (out, 0, leading_ones); + + /* Export GMP integer to binary (big-endian) */ + size_t exported_size = 0; + if (mpz_cmp_ui (num, 0) > 0) + { + size_t binary_size = (mpz_sizeinbase (num, 2) + 7) / 8; + assert (*outlen - leading_ones >= binary_size); + mpz_export (out + leading_ones, &exported_size, 1, 1, 1, 0, num); + } + + *outlen = leading_ones + exported_size; + + mpz_clear (num); + + return true; +} + + +static bool +base58_decode_ctx_finalize (struct base_decode_context *ctx, + char *restrict *out, idx_t *outlen) +{ + /* Ensure output buffer is large enough. + Worst case is input is all '1's. */ + idx_t max_outlen = ctx->ctx.base58.size; + if (max_outlen > *outlen) + { + *out = xrealloc (*out, max_outlen); + *outlen = max_outlen; + } + + bool ret = base58_decode ((char *)ctx->ctx.base58.buf, ctx->ctx.base58.size, + *out, outlen); + + free (ctx->ctx.base58.buf); + ctx->ctx.base58.buf = nullptr; + + return ret; +} + #endif /* BASE_TYPE == 42, i.e., "basenc"*/ @@ -1095,6 +1389,14 @@ do_encode (FILE *in, char const *infile, FILE *out, idx_t wrap_column) inbuf = xmalloc (ENC_BLOCKSIZE); outbuf = xmalloc (BASE_LENGTH (ENC_BLOCKSIZE)); +#if BASE_TYPE == 42 + /* Initialize encoding context if needed (for base58) */ + struct base_encode_context encode_ctx; + bool use_ctx = (base_encode_ctx_init != nullptr); + if (use_ctx) + base_encode_ctx_init (&encode_ctx); +#endif + do { idx_t n; @@ -1109,16 +1411,38 @@ do_encode (FILE *in, char const *infile, FILE *out, idx_t wrap_column) if (sum > 0) { - /* Process input one block at a time. Note that ENC_BLOCKSIZE - is sized so that no pad chars will appear in output. */ - base_encode (inbuf, sum, outbuf, BASE_LENGTH (sum)); +#if BASE_TYPE == 42 + if (use_ctx) + { + idx_t outlen = 0; + base_encode_ctx (&encode_ctx, inbuf, sum, outbuf, &outlen); + + wrap_write (outbuf, outlen, wrap_column, ¤t_column, out); + } + else +#endif + { + /* Process input one block at a time. Note that ENC_BLOCKSIZE + is sized so that no pad chars will appear in output. */ + base_encode (inbuf, sum, outbuf, BASE_LENGTH (sum)); - wrap_write (outbuf, BASE_LENGTH (sum), wrap_column, - ¤t_column, out); + wrap_write (outbuf, BASE_LENGTH (sum), wrap_column, + ¤t_column, out); + } } } while (!feof (in) && !ferror (in) && sum == ENC_BLOCKSIZE); +#if BASE_TYPE == 42 + if (use_ctx && base_encode_ctx_finalize) + { + idx_t outlen = BASE_LENGTH (ENC_BLOCKSIZE); + base_encode_ctx_finalize (&encode_ctx, &outbuf, &outlen); + + wrap_write (outbuf, outlen, wrap_column, ¤t_column, out); + } +#endif + /* When wrapping, terminate last line. */ if (wrap_column && current_column > 0 && fputc ('\n', out) == EOF) write_error (); @@ -1208,6 +1532,20 @@ do_decode (FILE *in, char const *infile, FILE *out, bool ignore_garbage) } while (!feof (in)); +#if BASE_TYPE == 42 + if (base_decode_ctx_finalize) + { + idx_t outlen = DEC_BLOCKSIZE; + bool ok = base_decode_ctx_finalize (&ctx, &outbuf, &outlen); + + if (fwrite (outbuf, 1, outlen, out) < outlen) + write_error (); + + if (!ok) + error (EXIT_FAILURE, 0, _("invalid input")); + } +#endif + finish_and_exit (in, infile); } @@ -1268,6 +1606,7 @@ main (int argc, char **argv) case BASE2MSBF_OPTION: case BASE2LSBF_OPTION: case Z85_OPTION: + case BASE58_OPTION: base_type = opt; break; #endif @@ -1356,6 +1695,18 @@ main (int argc, char **argv) base_decode_ctx = z85_decode_ctx; break; + case BASE58_OPTION: + base_length = base58_length; + required_padding = no_required_padding; + isubase = isubase58; + base_encode_ctx_init = base58_encode_ctx_init; + base_encode_ctx = base58_encode_ctx; + base_encode_ctx_finalize = base58_encode_ctx_finalize; + base_decode_ctx_init = base58_decode_ctx_init; + base_decode_ctx = base58_decode_ctx; + base_decode_ctx_finalize = base58_decode_ctx_finalize; + break; + default: error (0, 0, _("missing encoding type")); usage (EXIT_FAILURE); diff --git a/src/local.mk b/src/local.mk index ef1035bcb..c7c77a7c9 100644 --- a/src/local.mk +++ b/src/local.mk @@ -286,6 +286,7 @@ src_sort_LDADD += $(NANOSLEEP_LIB) src_tail_LDADD += $(NANOSLEEP_LIB) # for various GMP functions +src_basenc_LDADD += $(LIBGMP) src_expr_LDADD += $(LIBGMP) src_factor_LDADD += $(LIBGMP) diff --git a/tests/basenc/basenc.pl b/tests/basenc/basenc.pl index c598d29e5..1d50a8121 100755 --- a/tests/basenc/basenc.pl +++ b/tests/basenc/basenc.pl @@ -72,6 +72,18 @@ my $base2msbf_ab = "0110000101100010"; my $base2msbf_ab_nl = $base2msbf_ab; $base2msbf_ab_nl =~ s/(...)/$1\n/g; # Add newline every 3 characters +# Base58 test vectors +my $base58_in = "Hello World!"; +my $base58_out = "2NEpo7TZRRrLZSi2U"; +my $base58_in2 = "\x00\x00\x28\x7f\xb4\xcd"; +my $base58_out2 = "11233QC4"; +my $base58_in3 = "\x00"; +my $base58_out3 = "1"; +my $base58_in4 = "1\x00"; +my $base58_out4 = "4jH"; +my $base58_large_ones = "1" x 32768; +my $base58_large_NULs = "\x00" x 32768; + my $try_help = "Try '$prog --help' for more information.\n"; my @Tests = @@ -100,6 +112,7 @@ my @Tests = ['empty6', '--base2msbf', {IN=>''}, {OUT=>""}], ['empty7', '--base2lsbf', {IN=>''}, {OUT=>""}], ['empty8', '--z85', {IN=>''}, {OUT=>""}], + ['empty9', '--base58', {IN=>''}, {OUT=>""}], @@ -266,6 +279,35 @@ my @Tests = {ERR=>"$prog: invalid input\n"}], ['z85_47', '--z85 -d', {IN=>'#0000'}, {EXIT=>1}, {ERR=>"$prog: invalid input\n"}], + + + + + ['b58_1', '--base58', {IN=>$base58_in}, {OUT=>$base58_out}], + ['b58_2', '--base58 -d', {IN=>$base58_out}, {OUT=>$base58_in}], + ['b58_3', '--base58 -d -i', {IN=>'&'. $base58_out}, {OUT=>$base58_in}], + ['b58_4', '--base58', {IN=>$base58_in2}, {OUT=>$base58_out2}], + ['b58_5', '--base58 -d', {IN=>$base58_out2}, {OUT=>$base58_in2}], + ['b58_6', '--base58', {IN=>$base58_in3}, {OUT=>$base58_out3}], + ['b58_7', '--base58 -d', {IN=>$base58_out3}, {OUT=>$base58_in3}], + ['b58_8', '--base58 -d', {IN=>$base58_out."\n"}, {OUT=>$base58_in}], + ['b58_9', '--base58 -d -i', {IN=>$base58_out."\n"}, {OUT=>$base58_in}], + ['b58_10', '--base58', {IN=>$base58_in4}, {OUT=>$base58_out4}], + ['b58_11', '--base58 -d', {IN=>$base58_out4}, {OUT=>$base58_in4}], + ['b58_buf1', '--base58', {IN=>$base58_large_NULs}, + {OUT=>$base58_large_ones}], + ['b58_buf2', '--base58 -d', {IN=>$base58_large_ones}, + {OUT=>$base58_large_NULs}], + + # Invalid base58 characters (0, O, I, l) + ['b58_inval_1', '--base58 -d', {IN=>'0'}, {EXIT=>1}, + {ERR=>"$prog: invalid input\n"}], + ['b58_inval_2', '--base58 -d', {IN=>'O'}, {EXIT=>1}, + {ERR=>"$prog: invalid input\n"}], + ['b58_inval_3', '--base58 -d', {IN=>'I'}, {EXIT=>1}, + {ERR=>"$prog: invalid input\n"}], + ['b58_inval_4', '--base58 -d', {IN=>'l'}, {EXIT=>1}, + {ERR=>"$prog: invalid input\n"}], ); # Prepend the command line argument and append a newline to end -- 2.50.0