wgtmac commented on code in PR #150: URL: https://github.com/apache/iceberg-cpp/pull/150#discussion_r2221859777
########## src/iceberg/util/string_utils.h: ########## @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <algorithm> +#include <ranges> +#include <string> + +namespace iceberg::internal { + +class StringUtils { Review Comment: Either add `ICEBERG_EXPORT` or rename to `string_utils_internal.h`. As you have included it in the public file_format.h, I think we need to add `ICEBERG_EXPORT`. ########## src/iceberg/manifest_reader_internal.cc: ########## @@ -38,6 +38,121 @@ namespace iceberg { return InvalidArrowData("Nanoarrow error: {}", error.message); \ } +#define PARSE_PRIMITIVE_FIELD(item, array_view, type) \ + for (size_t row_idx = 0; row_idx < array_view->length; row_idx++) { \ + if (!ArrowArrayViewIsNull(array_view, row_idx)) { \ + auto value = ArrowArrayViewGetIntUnsafe(array_view, row_idx); \ + item = static_cast<type>(value); \ + } else if (required) { \ + return InvalidManifestList("Field {} is required but null at row {}", field_name, \ + row_idx); \ + } \ + } + +#define PARSE_ENUM_FIELD(item, array_view, type) \ + for (size_t row_idx = 0; row_idx < array_view->length; row_idx++) { \ + if (!ArrowArrayViewIsNull(array_view, row_idx)) { \ + auto value = ArrowArrayViewGetIntUnsafe(array_view, row_idx); \ + item = static_cast<type>(value); \ + } else if (required) { \ + return InvalidManifestList("Field {} is required but null at row {}", field_name, \ + row_idx); \ + } \ + } + +#define PARSE_STRING_FIELD(item, array_view) \ + for (size_t row_idx = 0; row_idx < array_view->length; row_idx++) { \ + if (!ArrowArrayViewIsNull(array_view, row_idx)) { \ + auto value = ArrowArrayViewGetStringUnsafe(array_view, row_idx); \ + std::string path_str(value.data, value.size_bytes); \ + item = path_str; \ + } else if (required) { \ + return InvalidManifestList("Field {} is required but null at row {}", field_name, \ + row_idx); \ + } \ + } + +#define PARSE_BINARY_FIELD(item, array_view) \ + for (size_t row_idx = 0; row_idx < array_view->length; row_idx++) { \ + if (!ArrowArrayViewIsNull(view_of_column, row_idx)) { \ + auto buffer = ArrowArrayViewGetBytesUnsafe(array_view, row_idx); \ + item = std::vector<uint8_t>(buffer.data.as_char, \ + buffer.data.as_char + buffer.size_bytes); \ + } else if (required) { \ + return InvalidManifestList("Field {} is required but null at row {}", field_name, \ + row_idx); \ + } \ + } + +#define PARSE_PRIMITIVE_VECTOR_FIELD(item, count, array_view, type) \ + for (int64_t manifest_idx = 0; manifest_idx < count; manifest_idx++) { \ + auto offset = ArrowArrayViewListChildOffset(array_view, manifest_idx); \ + auto next_offset = ArrowArrayViewListChildOffset(array_view, manifest_idx + 1); \ + for (int64_t offset_idx = offset; offset_idx < next_offset; offset_idx++) { \ + item.emplace_back(static_cast<type>( \ + ArrowArrayViewGetIntUnsafe(array_view->children[0], offset_idx))); \ + } \ + } + +#define PARSE_PRIMITIVE_MAP_FIELD(item, count, array_view) \ + do { \ + if (array_view->storage_type != ArrowType::NANOARROW_TYPE_MAP) { \ + return InvalidManifest("Field:{} should be a map.", field_name); \ + } \ + auto view_of_map = array_view->children[0]; \ + ASSERT_VIEW_TYPE_AND_CHILDREN(view_of_map, ArrowType::NANOARROW_TYPE_STRUCT, 2); \ + auto view_of_map_key = view_of_map->children[0]; \ + ASSERT_VIEW_TYPE(view_of_map_key, ArrowType::NANOARROW_TYPE_INT32); \ + auto view_of_map_value = view_of_map->children[1]; \ + ASSERT_VIEW_TYPE(view_of_map_value, ArrowType::NANOARROW_TYPE_INT64); \ + for (int64_t row_idx = 0; row_idx < count; row_idx++) { \ + auto offset = array_view->buffer_views[1].data.as_int32[row_idx]; \ + auto next_offset = array_view->buffer_views[1].data.as_int32[row_idx + 1]; \ + for (int32_t offset_idx = offset; offset_idx < next_offset; offset_idx++) { \ + auto key = ArrowArrayViewGetIntUnsafe(view_of_map_key, offset_idx); \ + auto value = ArrowArrayViewGetIntUnsafe(view_of_map_value, offset_idx); \ + item[key] = value; \ + } \ + } \ + } while (0) + +#define PARSE_BINARY_MAP_FIELD(item, count, array_view) \ Review Comment: Same question here. ########## src/iceberg/manifest_reader_internal.cc: ########## @@ -38,6 +38,121 @@ namespace iceberg { return InvalidArrowData("Nanoarrow error: {}", error.message); \ } +#define PARSE_PRIMITIVE_FIELD(item, array_view, type) \ + for (size_t row_idx = 0; row_idx < array_view->length; row_idx++) { \ + if (!ArrowArrayViewIsNull(array_view, row_idx)) { \ + auto value = ArrowArrayViewGetIntUnsafe(array_view, row_idx); \ + item = static_cast<type>(value); \ + } else if (required) { \ + return InvalidManifestList("Field {} is required but null at row {}", field_name, \ + row_idx); \ + } \ + } + +#define PARSE_ENUM_FIELD(item, array_view, type) \ + for (size_t row_idx = 0; row_idx < array_view->length; row_idx++) { \ + if (!ArrowArrayViewIsNull(array_view, row_idx)) { \ + auto value = ArrowArrayViewGetIntUnsafe(array_view, row_idx); \ + item = static_cast<type>(value); \ + } else if (required) { \ + return InvalidManifestList("Field {} is required but null at row {}", field_name, \ + row_idx); \ + } \ + } + +#define PARSE_STRING_FIELD(item, array_view) \ + for (size_t row_idx = 0; row_idx < array_view->length; row_idx++) { \ + if (!ArrowArrayViewIsNull(array_view, row_idx)) { \ + auto value = ArrowArrayViewGetStringUnsafe(array_view, row_idx); \ + std::string path_str(value.data, value.size_bytes); \ + item = path_str; \ Review Comment: ```suggestion item = std::string(value.data, value.size_bytes); \ ``` ########## src/iceberg/manifest_reader_internal.cc: ########## @@ -38,6 +38,121 @@ namespace iceberg { return InvalidArrowData("Nanoarrow error: {}", error.message); \ } +#define PARSE_PRIMITIVE_FIELD(item, array_view, type) \ + for (size_t row_idx = 0; row_idx < array_view->length; row_idx++) { \ + if (!ArrowArrayViewIsNull(array_view, row_idx)) { \ + auto value = ArrowArrayViewGetIntUnsafe(array_view, row_idx); \ + item = static_cast<type>(value); \ + } else if (required) { \ + return InvalidManifestList("Field {} is required but null at row {}", field_name, \ + row_idx); \ + } \ + } + +#define PARSE_ENUM_FIELD(item, array_view, type) \ + for (size_t row_idx = 0; row_idx < array_view->length; row_idx++) { \ + if (!ArrowArrayViewIsNull(array_view, row_idx)) { \ + auto value = ArrowArrayViewGetIntUnsafe(array_view, row_idx); \ + item = static_cast<type>(value); \ + } else if (required) { \ + return InvalidManifestList("Field {} is required but null at row {}", field_name, \ + row_idx); \ + } \ + } + +#define PARSE_STRING_FIELD(item, array_view) \ + for (size_t row_idx = 0; row_idx < array_view->length; row_idx++) { \ + if (!ArrowArrayViewIsNull(array_view, row_idx)) { \ + auto value = ArrowArrayViewGetStringUnsafe(array_view, row_idx); \ + std::string path_str(value.data, value.size_bytes); \ + item = path_str; \ + } else if (required) { \ + return InvalidManifestList("Field {} is required but null at row {}", field_name, \ + row_idx); \ + } \ + } + +#define PARSE_BINARY_FIELD(item, array_view) \ + for (size_t row_idx = 0; row_idx < array_view->length; row_idx++) { \ + if (!ArrowArrayViewIsNull(view_of_column, row_idx)) { \ + auto buffer = ArrowArrayViewGetBytesUnsafe(array_view, row_idx); \ + item = std::vector<uint8_t>(buffer.data.as_char, \ + buffer.data.as_char + buffer.size_bytes); \ + } else if (required) { \ + return InvalidManifestList("Field {} is required but null at row {}", field_name, \ + row_idx); \ + } \ + } + +#define PARSE_PRIMITIVE_VECTOR_FIELD(item, count, array_view, type) \ Review Comment: ditto, should we rename it to `PARSE_INTEGER_VECTOR_FIELD`? ########## src/iceberg/manifest_reader_internal.cc: ########## @@ -38,6 +38,121 @@ namespace iceberg { return InvalidArrowData("Nanoarrow error: {}", error.message); \ } +#define PARSE_PRIMITIVE_FIELD(item, array_view, type) \ + for (size_t row_idx = 0; row_idx < array_view->length; row_idx++) { \ + if (!ArrowArrayViewIsNull(array_view, row_idx)) { \ + auto value = ArrowArrayViewGetIntUnsafe(array_view, row_idx); \ + item = static_cast<type>(value); \ + } else if (required) { \ + return InvalidManifestList("Field {} is required but null at row {}", field_name, \ + row_idx); \ + } \ + } + +#define PARSE_ENUM_FIELD(item, array_view, type) \ + for (size_t row_idx = 0; row_idx < array_view->length; row_idx++) { \ + if (!ArrowArrayViewIsNull(array_view, row_idx)) { \ + auto value = ArrowArrayViewGetIntUnsafe(array_view, row_idx); \ + item = static_cast<type>(value); \ + } else if (required) { \ + return InvalidManifestList("Field {} is required but null at row {}", field_name, \ + row_idx); \ + } \ + } + +#define PARSE_STRING_FIELD(item, array_view) \ + for (size_t row_idx = 0; row_idx < array_view->length; row_idx++) { \ + if (!ArrowArrayViewIsNull(array_view, row_idx)) { \ + auto value = ArrowArrayViewGetStringUnsafe(array_view, row_idx); \ + std::string path_str(value.data, value.size_bytes); \ + item = path_str; \ + } else if (required) { \ + return InvalidManifestList("Field {} is required but null at row {}", field_name, \ + row_idx); \ + } \ + } + +#define PARSE_BINARY_FIELD(item, array_view) \ + for (size_t row_idx = 0; row_idx < array_view->length; row_idx++) { \ + if (!ArrowArrayViewIsNull(view_of_column, row_idx)) { \ + auto buffer = ArrowArrayViewGetBytesUnsafe(array_view, row_idx); \ + item = std::vector<uint8_t>(buffer.data.as_char, \ + buffer.data.as_char + buffer.size_bytes); \ + } else if (required) { \ + return InvalidManifestList("Field {} is required but null at row {}", field_name, \ + row_idx); \ + } \ + } + +#define PARSE_PRIMITIVE_VECTOR_FIELD(item, count, array_view, type) \ + for (int64_t manifest_idx = 0; manifest_idx < count; manifest_idx++) { \ + auto offset = ArrowArrayViewListChildOffset(array_view, manifest_idx); \ + auto next_offset = ArrowArrayViewListChildOffset(array_view, manifest_idx + 1); \ + for (int64_t offset_idx = offset; offset_idx < next_offset; offset_idx++) { \ + item.emplace_back(static_cast<type>( \ + ArrowArrayViewGetIntUnsafe(array_view->children[0], offset_idx))); \ + } \ + } + +#define PARSE_PRIMITIVE_MAP_FIELD(item, count, array_view) \ + do { \ + if (array_view->storage_type != ArrowType::NANOARROW_TYPE_MAP) { \ + return InvalidManifest("Field:{} should be a map.", field_name); \ + } \ + auto view_of_map = array_view->children[0]; \ + ASSERT_VIEW_TYPE_AND_CHILDREN(view_of_map, ArrowType::NANOARROW_TYPE_STRUCT, 2); \ + auto view_of_map_key = view_of_map->children[0]; \ + ASSERT_VIEW_TYPE(view_of_map_key, ArrowType::NANOARROW_TYPE_INT32); \ + auto view_of_map_value = view_of_map->children[1]; \ + ASSERT_VIEW_TYPE(view_of_map_value, ArrowType::NANOARROW_TYPE_INT64); \ + for (int64_t row_idx = 0; row_idx < count; row_idx++) { \ + auto offset = array_view->buffer_views[1].data.as_int32[row_idx]; \ + auto next_offset = array_view->buffer_views[1].data.as_int32[row_idx + 1]; \ + for (int32_t offset_idx = offset; offset_idx < next_offset; offset_idx++) { \ + auto key = ArrowArrayViewGetIntUnsafe(view_of_map_key, offset_idx); \ + auto value = ArrowArrayViewGetIntUnsafe(view_of_map_value, offset_idx); \ + item[key] = value; \ + } \ + } \ + } while (0) + +#define PARSE_BINARY_MAP_FIELD(item, count, array_view) \ Review Comment: BTW, I think `PARSE_PRIMITIVE_MAP_FIELD` and `PARSE_BINARY_MAP_FIELD` can be combined into a single macro/template if we make key/value parsing and `item[key]` assignment as function arguments. ########## src/iceberg/manifest_reader_internal.cc: ########## @@ -38,6 +38,121 @@ namespace iceberg { return InvalidArrowData("Nanoarrow error: {}", error.message); \ } +#define PARSE_PRIMITIVE_FIELD(item, array_view, type) \ + for (size_t row_idx = 0; row_idx < array_view->length; row_idx++) { \ + if (!ArrowArrayViewIsNull(array_view, row_idx)) { \ + auto value = ArrowArrayViewGetIntUnsafe(array_view, row_idx); \ + item = static_cast<type>(value); \ + } else if (required) { \ + return InvalidManifestList("Field {} is required but null at row {}", field_name, \ + row_idx); \ + } \ + } + +#define PARSE_ENUM_FIELD(item, array_view, type) \ Review Comment: Any difference with `PARSE_PRIMITIVE_FIELD` above? ########## src/iceberg/avro/avro_schema_util_internal.h: ########## @@ -144,4 +144,6 @@ std::string ToString(const ::avro::LogicalType::Type& logical_type); /// \return True if the node has a map logical type, false otherwise. bool HasMapLogicalType(const ::avro::NodePtr& node); +void RegisterLogicalTypes(); Review Comment: I think eventually we can add a new file `iceberg/avro/register.h` to register logical type, avro reader and avro writer in a single call. ########## src/iceberg/manifest_reader_internal.cc: ########## @@ -38,6 +38,121 @@ namespace iceberg { return InvalidArrowData("Nanoarrow error: {}", error.message); \ } +#define PARSE_PRIMITIVE_FIELD(item, array_view, type) \ + for (size_t row_idx = 0; row_idx < array_view->length; row_idx++) { \ + if (!ArrowArrayViewIsNull(array_view, row_idx)) { \ + auto value = ArrowArrayViewGetIntUnsafe(array_view, row_idx); \ + item = static_cast<type>(value); \ + } else if (required) { \ + return InvalidManifestList("Field {} is required but null at row {}", field_name, \ + row_idx); \ + } \ + } + +#define PARSE_ENUM_FIELD(item, array_view, type) \ + for (size_t row_idx = 0; row_idx < array_view->length; row_idx++) { \ + if (!ArrowArrayViewIsNull(array_view, row_idx)) { \ + auto value = ArrowArrayViewGetIntUnsafe(array_view, row_idx); \ + item = static_cast<type>(value); \ + } else if (required) { \ + return InvalidManifestList("Field {} is required but null at row {}", field_name, \ + row_idx); \ + } \ + } + +#define PARSE_STRING_FIELD(item, array_view) \ + for (size_t row_idx = 0; row_idx < array_view->length; row_idx++) { \ + if (!ArrowArrayViewIsNull(array_view, row_idx)) { \ + auto value = ArrowArrayViewGetStringUnsafe(array_view, row_idx); \ + std::string path_str(value.data, value.size_bytes); \ + item = path_str; \ + } else if (required) { \ + return InvalidManifestList("Field {} is required but null at row {}", field_name, \ + row_idx); \ + } \ + } + +#define PARSE_BINARY_FIELD(item, array_view) \ + for (size_t row_idx = 0; row_idx < array_view->length; row_idx++) { \ + if (!ArrowArrayViewIsNull(view_of_column, row_idx)) { \ + auto buffer = ArrowArrayViewGetBytesUnsafe(array_view, row_idx); \ + item = std::vector<uint8_t>(buffer.data.as_char, \ + buffer.data.as_char + buffer.size_bytes); \ + } else if (required) { \ + return InvalidManifestList("Field {} is required but null at row {}", field_name, \ + row_idx); \ + } \ + } + +#define PARSE_PRIMITIVE_VECTOR_FIELD(item, count, array_view, type) \ + for (int64_t manifest_idx = 0; manifest_idx < count; manifest_idx++) { \ + auto offset = ArrowArrayViewListChildOffset(array_view, manifest_idx); \ + auto next_offset = ArrowArrayViewListChildOffset(array_view, manifest_idx + 1); \ + for (int64_t offset_idx = offset; offset_idx < next_offset; offset_idx++) { \ + item.emplace_back(static_cast<type>( \ + ArrowArrayViewGetIntUnsafe(array_view->children[0], offset_idx))); \ + } \ + } + +#define PARSE_PRIMITIVE_MAP_FIELD(item, count, array_view) \ Review Comment: This is restricted to `map<int, long>`? Perhaps use a clearer name? ########## src/iceberg/manifest_reader_internal.cc: ########## @@ -38,6 +38,121 @@ namespace iceberg { return InvalidArrowData("Nanoarrow error: {}", error.message); \ } +#define PARSE_PRIMITIVE_FIELD(item, array_view, type) \ Review Comment: Should we rename it to `PARSE_INTEGER_FIELD`? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
