benibus commented on code in PR #14100:
URL: https://github.com/apache/arrow/pull/14100#discussion_r976637404
##########
cpp/src/arrow/json/test_common.h:
##########
@@ -99,7 +99,7 @@ struct GenerateImpl {
template <typename T>
enable_if_base_binary<T, Status> Visit(const T&) {
auto size = std::poisson_distribution<>{4}(e);
- std::uniform_int_distribution<uint16_t> gen_char(32, 127); // FIXME
generate UTF8
+ std::uniform_int_distribution<uint16_t> gen_char(32, 126); // FIXME
generate UTF8
Review Comment:
I assume that ASCII DEL wasn't intended to be included in the test input...
##########
cpp/src/arrow/json/parser_benchmark.cc:
##########
@@ -30,24 +30,50 @@
namespace arrow {
namespace json {
-std::shared_ptr<Schema> TestSchema() {
- return schema({field("int", int32()), field("str", utf8())});
-}
-
-constexpr int seed = 0x432432;
-
-std::string TestJsonData(int num_rows, bool pretty = false) {
- std::default_random_engine engine(seed);
- std::string json;
- for (int i = 0; i < num_rows; ++i) {
- StringBuffer sb;
- Writer writer(sb);
- ABORT_NOT_OK(Generate(TestSchema(), engine, &writer));
- json += pretty ? PrettyPrint(sb.GetString()) : sb.GetString();
- json += "\n";
+class JSONGenerator {
+ public:
+ constexpr static int kSeed = 0x432432;
+
+ constexpr explicit JSONGenerator(bool pretty = false) : pretty_(pretty) {}
+
+ template <typename T>
+ std::string operator()(const T& input, int32_t num_rows) const {
+ std::default_random_engine engine(kSeed);
+ std::string json;
+ for (int i = 0; i < num_rows; ++i) {
+ StringBuffer sb;
+ Writer writer(sb);
+ ABORT_NOT_OK(Generate(input, engine, &writer));
+ json += pretty_ ? PrettyPrint(sb.GetString()) : sb.GetString();
+ json += "\n";
+ }
+ return json;
}
- return json;
+ private:
+ bool pretty_;
+};
+
+constexpr auto JSONString = JSONGenerator{false};
+constexpr auto JSONStringPretty = JSONGenerator{true};
+
+// Both field sets are the worst-case ordering of each other - i.e. the parser
cannot
+// reliably predict the next field in B given the definition of A.
+FieldVector TestFields1() {
+ return {
+ field("int", int32()),
+ field("str", utf8()),
+ };
+}
+FieldVector TestFields2() {
+ return {
+ field("str", utf8()),
+ field("int", int32()),
+ };
+}
Review Comment:
I held back on adding fields for now since I wasn't sure how it would affect
regression tests (for the ordered case, at least).
I also tried out randomly shuffling the fields for every row. Given the
parser's current prediction strategy and limited number of fields, this felt
unnecessary - but it would be a more robust solution. The new `JSONGenerator`
helper could be be altered to accept a callable rather than constant input.
##########
cpp/src/arrow/json/parser.cc:
##########
@@ -383,13 +392,14 @@ class RawArrayBuilder<Kind::kObject> {
private:
struct FieldInfo {
- string_view name; // Backed by key's allocation in name_to_index_
+ std::unique_ptr<std::string> name;
BuilderPtr builder;
};
Review Comment:
The simplest solution I could think of to avoid the `std::string` relocation
issues. The faster approach would be to store the allocations in a separate
`std::forward_list` and hold another `string_view` here, avoiding the
double-dereference (at a higher memory cost).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]