morningman commented on code in PR #11381:
URL: https://github.com/apache/doris/pull/11381#discussion_r935043335


##########
be/src/vec/exec/format/parquet/schema_desc.cpp:
##########
@@ -17,17 +17,370 @@
 
 #include "schema_desc.h"
 
+#include "gutil/strings/substitute.h"
+
 namespace doris::vectorized {
 
-SchemaDescriptor::~SchemaDescriptor() {
-    //    fields.clear();
+static bool is_group_node(const tparquet::SchemaElement& schema) {
+    return schema.num_children > 0;
+}
+
+static bool is_list_node(const tparquet::SchemaElement& schema) {
+    return schema.__isset.converted_type && schema.converted_type == tparquet::ConvertedType::LIST;
+}
+
+static bool is_map_node(const tparquet::SchemaElement& schema) {
+    return schema.__isset.converted_type &&
+           (schema.converted_type == tparquet::ConvertedType::MAP ||
+            schema.converted_type == tparquet::ConvertedType::MAP_KEY_VALUE);
+}
+
+static bool is_repeated_node(const tparquet::SchemaElement& schema) {
+    return schema.__isset.repetition_type &&
+           schema.repetition_type == tparquet::FieldRepetitionType::REPEATED;
+}
+
+static bool is_required_node(const tparquet::SchemaElement& schema) {
+    return schema.__isset.repetition_type &&
+           schema.repetition_type == tparquet::FieldRepetitionType::REQUIRED;
+}
+
+static bool is_optional_node(const tparquet::SchemaElement& schema) {
+    return schema.__isset.repetition_type &&
+           schema.repetition_type == tparquet::FieldRepetitionType::OPTIONAL;
+}
+
+static int num_children_node(const tparquet::SchemaElement& schema) {
+    return schema.__isset.num_children ? schema.num_children : 0;
+}
+
+static void set_child_node_level(FieldSchema* parent, size_t rep_inc = 0, size_t def_inc = 0) {
+    for (auto& child : parent->children) {
+        child.repetition_level = parent->repetition_level + rep_inc;
+        child.definition_level = parent->definition_level + def_inc;
+    }
 }
 
-std::string SchemaDescriptor::debug_string() const {
-    return std::string();
+static bool is_struct_list_node(const tparquet::SchemaElement& schema) {
+    const std::string& name = schema.name;
+    static const Slice array_slice("array", 5);
+    static const Slice tuple_slice("_tuple", 6);
+    Slice slice(name);
+    return slice == array_slice || slice.ends_with(tuple_slice);
 }
 
 std::string FieldSchema::debug_string() const {
-    return std::string();
+    std::stringstream ss;
+    ss << "FieldSchema(name=" << name << ", R=" << repetition_level << ", D=" << definition_level;
+    if (children.size() > 0) {
+        ss << ", type=" << type.type << ", children=[";
+        for (int i = 0; i < children.size(); ++i) {
+            if (i != 0) {
+                ss << ", ";
+            }
+            ss << children[i].debug_string();
+        }
+        ss << "]";
+    } else {
+        ss << ", physical_type=" << physical_type;
+    }
+    ss << ")";
+    return ss.str();
+}
+
+Status FieldDescriptor::parse_from_thrift(const std::vector<tparquet::SchemaElement>& t_schemas) {
+    if (t_schemas.size() == 0 || !is_group_node(t_schemas[0])) {
+        return Status::InvalidArgument("Wrong parquet root schema element");
+    }
+    auto& root_schema = t_schemas[0];
+    _fields.resize(root_schema.num_children);
+    _next_schema_pos = 1;
+    for (int i = 0; i < root_schema.num_children; ++i) {
+        RETURN_IF_ERROR(parse_node_field(t_schemas, _next_schema_pos, &_fields[i]));
+        if (_name_to_field.find(_fields[i].name) != _name_to_field.end()) {
+            return Status::InvalidArgument(

Review Comment:
   Same as in all the other places.



##########
be/src/vec/exec/format/parquet/schema_desc.cpp:
##########
@@ -17,17 +17,370 @@
 
 #include "schema_desc.h"
 
+#include "gutil/strings/substitute.h"
+
 namespace doris::vectorized {
 
-SchemaDescriptor::~SchemaDescriptor() {
-    //    fields.clear();
+static bool is_group_node(const tparquet::SchemaElement& schema) {
+    return schema.num_children > 0;
+}
+
+static bool is_list_node(const tparquet::SchemaElement& schema) {
+    return schema.__isset.converted_type && schema.converted_type == tparquet::ConvertedType::LIST;
+}
+
+static bool is_map_node(const tparquet::SchemaElement& schema) {
+    return schema.__isset.converted_type &&
+           (schema.converted_type == tparquet::ConvertedType::MAP ||
+            schema.converted_type == tparquet::ConvertedType::MAP_KEY_VALUE);
+}
+
+static bool is_repeated_node(const tparquet::SchemaElement& schema) {
+    return schema.__isset.repetition_type &&
+           schema.repetition_type == tparquet::FieldRepetitionType::REPEATED;
+}
+
+static bool is_required_node(const tparquet::SchemaElement& schema) {
+    return schema.__isset.repetition_type &&
+           schema.repetition_type == tparquet::FieldRepetitionType::REQUIRED;
+}
+
+static bool is_optional_node(const tparquet::SchemaElement& schema) {
+    return schema.__isset.repetition_type &&
+           schema.repetition_type == tparquet::FieldRepetitionType::OPTIONAL;
+}
+
+static int num_children_node(const tparquet::SchemaElement& schema) {
+    return schema.__isset.num_children ? schema.num_children : 0;
+}
+
+static void set_child_node_level(FieldSchema* parent, size_t rep_inc = 0, size_t def_inc = 0) {
+    for (auto& child : parent->children) {
+        child.repetition_level = parent->repetition_level + rep_inc;
+        child.definition_level = parent->definition_level + def_inc;
+    }
 }
 
-std::string SchemaDescriptor::debug_string() const {
-    return std::string();
+static bool is_struct_list_node(const tparquet::SchemaElement& schema) {
+    const std::string& name = schema.name;
+    static const Slice array_slice("array", 5);
+    static const Slice tuple_slice("_tuple", 6);
+    Slice slice(name);
+    return slice == array_slice || slice.ends_with(tuple_slice);
 }
 
 std::string FieldSchema::debug_string() const {
-    return std::string();
+    std::stringstream ss;
+    ss << "FieldSchema(name=" << name << ", R=" << repetition_level << ", D=" << definition_level;
+    if (children.size() > 0) {
+        ss << ", type=" << type.type << ", children=[";
+        for (int i = 0; i < children.size(); ++i) {
+            if (i != 0) {
+                ss << ", ";
+            }
+            ss << children[i].debug_string();
+        }
+        ss << "]";
+    } else {
+        ss << ", physical_type=" << physical_type;
+    }
+    ss << ")";
+    return ss.str();
+}
+
+Status FieldDescriptor::parse_from_thrift(const std::vector<tparquet::SchemaElement>& t_schemas) {
+    if (t_schemas.size() == 0 || !is_group_node(t_schemas[0])) {
+        return Status::InvalidArgument("Wrong parquet root schema element");
+    }
+    auto& root_schema = t_schemas[0];
+    _fields.resize(root_schema.num_children);
+    _next_schema_pos = 1;
+    for (int i = 0; i < root_schema.num_children; ++i) {
+        RETURN_IF_ERROR(parse_node_field(t_schemas, _next_schema_pos, &_fields[i]));
+        if (_name_to_field.find(_fields[i].name) != _name_to_field.end()) {
+            return Status::InvalidArgument(

Review Comment:
   You can just write:
   ```
   return Status::InvalidArgument("Duplicated field name: {}", _fields[i].name);
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to