pitrou commented on a change in pull request #10586: URL: https://github.com/apache/arrow/pull/10586#discussion_r658068663
########## File path: cpp/src/arrow/compute/kernels/scalar_string.cc ########## @@ -93,19 +93,23 @@ struct BinaryLength { } }; +template <typename OffsetValue> +OffsetValue CountCodepoints(const uint8_t* str, size_t strlen) { Review comment: There's no need to template this on `OffsetValue`, just return `int64_t`. Also, since this is a generally useful function, perhaps put it in `arrow/util/utf8.h`? ########## File path: cpp/src/arrow/compute/kernels/scalar_string.cc ########## @@ -2755,6 +2759,139 @@ Result<ValueDescr> StrptimeResolve(KernelContext* ctx, const std::vector<ValueDe return Status::Invalid("strptime does not provide default StrptimeOptions"); } +// ---------------------------------------------------------------------- +// string padding + +template <bool PadLeft, bool PadRight> +struct AsciiPadTransform : public StringTransformBase { + using State = OptionsWrapper<PadOptions>; + + const PadOptions& options_; + + explicit AsciiPadTransform(const PadOptions& options) : options_(options) {} + + Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override { + if (options_.padding.size() != 1) { + return Status::Invalid("Padding must be one byte, got '", options_.padding, "'"); + } + return Status::OK(); + } + + int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override { + // This is likely very overallocated but hard to do better without + // actually looking at each string (because of strings that may be + // longer than the given width) + return input_ncodeunits + ninputs * options_.width; + } + + int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, + uint8_t* output) { + if (input_string_ncodeunits >= options_.width) { + std::copy(input, input + input_string_ncodeunits, output); + return input_string_ncodeunits; + } + const int64_t spaces = options_.width - input_string_ncodeunits; + int64_t left = 0; + int64_t right = 0; + if (PadLeft && PadRight) { + // If odd number of spaces, put them on the left + right = spaces / 2; + left = spaces - right; + } else if (PadLeft) { + left = spaces; + } else if (PadRight) { + right = spaces; + } else { + DCHECK(false) << "unreachable"; + return 0; + } + std::fill(output, output + left, options_.padding[0]); + output += left; + output = std::copy(input, input + input_string_ncodeunits, output); + std::fill(output, output + right, options_.padding[0]); + return options_.width; + } +}; + +template <bool PadLeft, bool PadRight> +struct Utf8PadTransform : public StringTransformBase { + using State = OptionsWrapper<PadOptions>; + + const PadOptions& options_; + + explicit Utf8PadTransform(const PadOptions& options) : options_(options) {} + + Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override { + auto str = reinterpret_cast<const uint8_t*>(options_.padding.data()); + auto strlen = options_.padding.size(); + if (CountCodepoints<int64_t>(str, strlen) != 1) { + return Status::Invalid("Padding must be one codepoint, got '", options_.padding, + "'"); + } + return Status::OK(); + } + + int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override { + // This is likely very overallocated but hard to do better without + // actually looking at each string (because of strings that may be + // longer than the given width) + // One codepoint may be up to 4 bytes + return input_ncodeunits + 4 * ninputs * options_.width; + } + + int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, + uint8_t* output) { + // XXX input_string_ncodeunits is misleading - it's just bytes Review comment: "codeunits" is [the same](https://unicode.org/glossary/#code_unit) as bytes for UTF8 ("codeunits" as opposed to "codepoints"). But, yes, it's pointlessly pedantic. Perhaps a wholesale renaming would be useful. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org