felipecrv commented on code in PR #38269:
URL: https://github.com/apache/arrow/pull/38269#discussion_r1361109114


##########
cpp/src/arrow/filesystem/azurefs_test.cc:
##########
@@ -154,6 +137,332 @@ TEST(AzureFileSystem, OptionsCompare) {
   EXPECT_TRUE(options.Equals(options));
 }
 
+class TestAzureFileSystem : public ::testing::Test {
+ public:
+  std::shared_ptr<FileSystem> fs_;
+  std::shared_ptr<Azure::Storage::Blobs::BlobServiceClient> service_client_;
+  AzureOptions options_;
+  std::mt19937_64 generator_;
+  std::string container_name_;
+
+  void MakeFileSystem() {
+    const std::string& account_name = GetAzuriteEnv()->account_name();
+    const std::string& account_key = GetAzuriteEnv()->account_key();
+    options_.backend = AzureBackend::Azurite;
+    ASSERT_OK(options_.ConfigureAccountKeyCredentials(account_name, 
account_key));
+  }
+
+  void SetUp() override {
+    ASSERT_THAT(GetAzuriteEnv(), NotNull());
+    ASSERT_OK(GetAzuriteEnv()->status());
+
+    MakeFileSystem();
+    generator_ = std::mt19937_64(std::random_device()());

Review Comment:
   This could be moved to the constructor of `TestAzureFileSystem` -- the 
random generator doesn't have to be re-initialized between test cases.



##########
cpp/src/arrow/filesystem/azurefs_test.cc:
##########
@@ -154,6 +137,332 @@ TEST(AzureFileSystem, OptionsCompare) {
   EXPECT_TRUE(options.Equals(options));
 }
 
+class TestAzureFileSystem : public ::testing::Test {
+ public:
+  std::shared_ptr<FileSystem> fs_;
+  std::shared_ptr<Azure::Storage::Blobs::BlobServiceClient> service_client_;
+  AzureOptions options_;
+  std::mt19937_64 generator_;
+  std::string container_name_;
+
+  void MakeFileSystem() {
+    const std::string& account_name = GetAzuriteEnv()->account_name();
+    const std::string& account_key = GetAzuriteEnv()->account_key();
+    options_.backend = AzureBackend::Azurite;
+    ASSERT_OK(options_.ConfigureAccountKeyCredentials(account_name, 
account_key));
+  }
+
+  void SetUp() override {
+    ASSERT_THAT(GetAzuriteEnv(), NotNull());
+    ASSERT_OK(GetAzuriteEnv()->status());
+
+    MakeFileSystem();
+    generator_ = std::mt19937_64(std::random_device()());
+    container_name_ = RandomChars(32);
+    service_client_ = 
std::make_shared<Azure::Storage::Blobs::BlobServiceClient>(
+        options_.account_blob_url, options_.storage_credentials_provider);
+    ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options_));
+    auto container_client = 
service_client_->GetBlobContainerClient(container_name_);
+    container_client.CreateIfNotExists();
+
+    auto blob_client = 
container_client.GetBlockBlobClient(PreexistingObjectName());
+    blob_client.UploadFrom(reinterpret_cast<const uint8_t*>(kLoremIpsum),
+                           strlen(kLoremIpsum));
+  }
+
+  void TearDown() override {
+    auto containers = service_client_->ListBlobContainers();
+    for (auto container : containers.BlobContainers) {
+      auto container_client = 
service_client_->GetBlobContainerClient(container.Name);
+      container_client.DeleteIfExists();
+    }
+  }
+
+  std::string PreexistingContainerName() const { return container_name_; }
+
+  std::string PreexistingContainerPath() const {
+    return PreexistingContainerName() + '/';
+  }
+
+  static std::string PreexistingObjectName() { return "test-object-name"; }
+
+  std::string PreexistingObjectPath() const {
+    return PreexistingContainerPath() + PreexistingObjectName();
+  }
+
+  std::string NotFoundObjectPath() { return PreexistingContainerPath() + 
"not-found"; }
+
+  std::string RandomLine(int lineno, std::size_t width) {
+    auto line = std::to_string(lineno) + ":    ";
+    line += RandomChars(width - line.size() - 1);
+    line += '\n';
+    return line;
+  }
+
+  uint8_t RandomInteger() {
+    return std::uniform_int_distribution<std::uint8_t>()(generator_);
+  }

Review Comment:
   Unused?



##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -37,34 +43,331 @@ bool AzureOptions::Equals(const AzureOptions& other) const 
{
           credentials_kind == other.credentials_kind);
 }
 
+Status AzureOptions::ConfigureAccountKeyCredentials(const std::string& 
account_name,
+                                                    const std::string& 
account_key) {
+  if (this->backend == AzureBackend::Azurite) {
+    account_blob_url = "http://127.0.0.1:10000/"; + account_name + "/";
+    account_dfs_url = "http://127.0.0.1:10000/"; + account_name + "/";
+  } else {
+    account_dfs_url = "https://"; + account_name + ".dfs.core.windows.net/";
+    account_blob_url = "https://"; + account_name + ".blob.core.windows.net/";
+  }
+  storage_credentials_provider =
+      
std::make_shared<Azure::Storage::StorageSharedKeyCredential>(account_name,
+                                                                   
account_key);
+  credentials_kind = AzureCredentialsKind::StorageCredentials;
+  return Status::OK();
+}
+namespace {
+
+// An AzureFileSystem represents a single Azure storage account. AzurePath 
describes a
+// container and path within that storage account.
+struct AzurePath {
+  std::string full_path;
+  std::string container;
+  std::string path_to_file;
+  std::vector<std::string> path_to_file_parts;
+
+  static Result<AzurePath> FromString(const std::string& s) {
+    // Example expected string format: testcontainer/testdir/testfile.txt
+    // container = testcontainer
+    // path_to_file = testdir/testfile.txt
+    // path_to_file_parts = [testdir, testfile.txt]
+    if (internal::IsLikelyUri(s)) {
+      return Status::Invalid(
+          "Expected an Azure object path of the form 'container/path...', got 
a URI: '",
+          s, "'");
+    }
+    auto src = internal::RemoveTrailingSlash(s);
+    auto input_path = std::string(src.data());
+    src = internal::RemoveLeadingSlash(src);
+    auto first_sep = src.find_first_of(internal::kSep);
+    if (first_sep == 0) {
+      return Status::Invalid("Path cannot start with a separator ('", 
input_path, "')");
+    }
+    if (first_sep == std::string::npos) {
+      return AzurePath{std::string(src), std::string(src), "", {}};
+    }
+    AzurePath path;
+    path.full_path = std::string(src);
+    path.container = std::string(src.substr(0, first_sep));
+    path.path_to_file = std::string(src.substr(first_sep + 1));
+    path.path_to_file_parts = internal::SplitAbstractPath(path.path_to_file);
+    RETURN_NOT_OK(Validate(path));
+    return path;
+  }
+
+  static Status Validate(const AzurePath& path) {
+    auto status = internal::ValidateAbstractPathParts(path.path_to_file_parts);
+    if (!status.ok()) {
+      return Status::Invalid(status.message(), " in path ", path.full_path);
+    } else {
+      return status;
+    }
+  }
+
+  AzurePath parent() const {
+    DCHECK(has_parent());
+    auto parent = AzurePath{"", container, "", path_to_file_parts};
+    parent.path_to_file_parts.pop_back();
+    parent.path_to_file = 
internal::JoinAbstractPath(parent.path_to_file_parts);
+    if (parent.path_to_file.empty()) {
+      parent.full_path = parent.container;
+    } else {
+      parent.full_path = parent.container + internal::kSep + 
parent.path_to_file;
+    }
+    return parent;
+  }
+
+  bool has_parent() const { return !path_to_file.empty(); }
+
+  bool empty() const { return container.empty() && path_to_file.empty(); }
+
+  bool operator==(const AzurePath& other) const {
+    return container == other.container && path_to_file == other.path_to_file;
+  }
+};
+
+Status PathNotFound(const AzurePath& path) {
+  return ::arrow::fs::internal::PathNotFound(path.full_path);
+}
+
+Status NotAFile(const AzurePath& path) {
+  return ::arrow::fs::internal::NotAFile(path.full_path);
+}
+
+Status ValidateFilePath(const AzurePath& path) {
+  if (path.container.empty()) {
+    return PathNotFound(path);
+  }
+
+  if (path.path_to_file.empty()) {
+    return NotAFile(path);
+  }
+  return Status::OK();
+}
+
+Status ErrorToStatus(const std::string& prefix,
+                     const Azure::Storage::StorageException& exception) {
+  return Status::IOError(prefix, " Azure Error: ", exception.what());
+}
+
+template <typename ObjectResult>
+std::shared_ptr<const KeyValueMetadata> GetObjectMetadata(const ObjectResult& 
result) {
+  auto md = std::make_shared<KeyValueMetadata>();
+  for (auto prop : result) {
+    md->Append(prop.first, prop.second);
+  }
+  return md;
+}
+
+class ObjectInputFile final : public io::RandomAccessFile {
+ public:
+  ObjectInputFile(std::shared_ptr<Azure::Storage::Blobs::BlobClient>& 
blob_client,

Review Comment:
   should remove `&` since you're `std::move`ing it.



##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -37,34 +43,331 @@ bool AzureOptions::Equals(const AzureOptions& other) const 
{
           credentials_kind == other.credentials_kind);
 }
 
+Status AzureOptions::ConfigureAccountKeyCredentials(const std::string& 
account_name,
+                                                    const std::string& 
account_key) {
+  if (this->backend == AzureBackend::Azurite) {
+    account_blob_url = "http://127.0.0.1:10000/"; + account_name + "/";
+    account_dfs_url = "http://127.0.0.1:10000/"; + account_name + "/";
+  } else {
+    account_dfs_url = "https://"; + account_name + ".dfs.core.windows.net/";
+    account_blob_url = "https://"; + account_name + ".blob.core.windows.net/";
+  }
+  storage_credentials_provider =
+      
std::make_shared<Azure::Storage::StorageSharedKeyCredential>(account_name,
+                                                                   
account_key);
+  credentials_kind = AzureCredentialsKind::StorageCredentials;
+  return Status::OK();
+}
+namespace {
+
+// An AzureFileSystem represents a single Azure storage account. AzurePath 
describes a
+// container and path within that storage account.
+struct AzurePath {
+  std::string full_path;
+  std::string container;
+  std::string path_to_file;
+  std::vector<std::string> path_to_file_parts;
+
+  static Result<AzurePath> FromString(const std::string& s) {
+    // Example expected string format: testcontainer/testdir/testfile.txt
+    // container = testcontainer
+    // path_to_file = testdir/testfile.txt
+    // path_to_file_parts = [testdir, testfile.txt]
+    if (internal::IsLikelyUri(s)) {
+      return Status::Invalid(
+          "Expected an Azure object path of the form 'container/path...', got 
a URI: '",
+          s, "'");
+    }
+    auto src = internal::RemoveTrailingSlash(s);
+    auto input_path = std::string(src.data());
+    src = internal::RemoveLeadingSlash(src);
+    auto first_sep = src.find_first_of(internal::kSep);
+    if (first_sep == 0) {
+      return Status::Invalid("Path cannot start with a separator ('", 
input_path, "')");
+    }
+    if (first_sep == std::string::npos) {
+      return AzurePath{std::string(src), std::string(src), "", {}};
+    }
+    AzurePath path;
+    path.full_path = std::string(src);
+    path.container = std::string(src.substr(0, first_sep));
+    path.path_to_file = std::string(src.substr(first_sep + 1));
+    path.path_to_file_parts = internal::SplitAbstractPath(path.path_to_file);
+    RETURN_NOT_OK(Validate(path));
+    return path;
+  }
+
+  static Status Validate(const AzurePath& path) {
+    auto status = internal::ValidateAbstractPathParts(path.path_to_file_parts);
+    if (!status.ok()) {
+      return Status::Invalid(status.message(), " in path ", path.full_path);
+    } else {
+      return status;
+    }
+  }
+
+  AzurePath parent() const {
+    DCHECK(has_parent());
+    auto parent = AzurePath{"", container, "", path_to_file_parts};
+    parent.path_to_file_parts.pop_back();
+    parent.path_to_file = 
internal::JoinAbstractPath(parent.path_to_file_parts);
+    if (parent.path_to_file.empty()) {
+      parent.full_path = parent.container;
+    } else {
+      parent.full_path = parent.container + internal::kSep + 
parent.path_to_file;
+    }
+    return parent;
+  }
+
+  bool has_parent() const { return !path_to_file.empty(); }
+
+  bool empty() const { return container.empty() && path_to_file.empty(); }
+
+  bool operator==(const AzurePath& other) const {
+    return container == other.container && path_to_file == other.path_to_file;
+  }
+};
+
+Status PathNotFound(const AzurePath& path) {
+  return ::arrow::fs::internal::PathNotFound(path.full_path);
+}
+
+Status NotAFile(const AzurePath& path) {
+  return ::arrow::fs::internal::NotAFile(path.full_path);
+}
+
+Status ValidateFilePath(const AzurePath& path) {
+  if (path.container.empty()) {
+    return PathNotFound(path);
+  }
+
+  if (path.path_to_file.empty()) {
+    return NotAFile(path);
+  }
+  return Status::OK();
+}
+
+Status ErrorToStatus(const std::string& prefix,
+                     const Azure::Storage::StorageException& exception) {
+  return Status::IOError(prefix, " Azure Error: ", exception.what());
+}
+
+template <typename ObjectResult>
+std::shared_ptr<const KeyValueMetadata> GetObjectMetadata(const ObjectResult& 
result) {
+  auto md = std::make_shared<KeyValueMetadata>();
+  for (auto prop : result) {
+    md->Append(prop.first, prop.second);
+  }
+  return md;
+}
+
+class ObjectInputFile final : public io::RandomAccessFile {
+ public:
+  ObjectInputFile(std::shared_ptr<Azure::Storage::Blobs::BlobClient>& 
blob_client,
+                  const io::IOContext& io_context, const AzurePath& path,
+                  int64_t size = kNoSize)
+      : blob_client_(std::move(blob_client)),
+        io_context_(io_context),
+        path_(path),
+        content_length_(size) {}
+
+  Status Init() {
+    if (content_length_ != kNoSize) {
+      DCHECK_GE(content_length_, 0);
+      return Status::OK();
+    }
+    try {
+      auto properties = blob_client_->GetProperties();
+      content_length_ = properties.Value.BlobSize;
+      metadata_ = GetObjectMetadata(properties.Value.Metadata);
+      return Status::OK();
+    } catch (const Azure::Storage::StorageException& exception) {
+      if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) 
{
+        // Could be either container or blob not found.
+        return PathNotFound(path_);
+      }
+      return ErrorToStatus(
+          "When fetching properties for '" + blob_client_->GetUrl() + "': ", 
exception);
+    }
+  }
+
+  Status CheckClosed() const {
+    if (closed_) {
+      return Status::Invalid("Operation on closed stream");
+    }
+    return Status::OK();
+  }
+
+  Status CheckPosition(int64_t position, const char* action) const {
+    if (position < 0) {
+      return Status::Invalid("Cannot ", action, " from negative position");
+    }
+    if (position > content_length_) {
+      return Status::IOError("Cannot ", action, " past end of file");
+    }
+    return Status::OK();
+  }
+
+  // RandomAccessFile APIs
+
+  Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata() override {
+    return metadata_;
+  }
+
+  Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync(
+      const io::IOContext& io_context) override {
+    return metadata_;
+  }
+
+  Status Close() override {
+    blob_client_ = nullptr;
+    closed_ = true;
+    return Status::OK();
+  }
+
+  bool closed() const override { return closed_; }
+
+  Result<int64_t> Tell() const override {
+    RETURN_NOT_OK(CheckClosed());
+    return pos_;
+  }
+
+  Result<int64_t> GetSize() override {
+    RETURN_NOT_OK(CheckClosed());
+    return content_length_;
+  }
+
+  Status Seek(int64_t position) override {
+    RETURN_NOT_OK(CheckClosed());
+    RETURN_NOT_OK(CheckPosition(position, "seek"));
+
+    pos_ = position;
+    return Status::OK();
+  }
+
+  Result<int64_t> ReadAt(int64_t position, int64_t nbytes, void* out) override 
{
+    RETURN_NOT_OK(CheckClosed());
+    RETURN_NOT_OK(CheckPosition(position, "read"));
+
+    nbytes = std::min(nbytes, content_length_ - position);
+    if (nbytes == 0) {
+      return 0;
+    }
+
+    // Read the desired range of bytes
+    Azure::Core::Http::HttpRange range{.Offset = position, .Length = nbytes};
+    Azure::Storage::Blobs::DownloadBlobToOptions download_options{.Range = 
range};
+    try {
+      auto result =
+          blob_client_
+              ->DownloadTo(reinterpret_cast<uint8_t*>(out), nbytes, 
download_options)
+              .Value;
+      return result.ContentRange.Length.Value();

Review Comment:
   I don't know what `Value` contains, but it's being copied here and that 
could be avoided safely by doing:
   
   `return blob_client_->DownloadTo(...).Value.ContentRange.Length.Value();`



##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -37,34 +43,331 @@ bool AzureOptions::Equals(const AzureOptions& other) const 
{
           credentials_kind == other.credentials_kind);
 }
 
+Status AzureOptions::ConfigureAccountKeyCredentials(const std::string& 
account_name,
+                                                    const std::string& 
account_key) {
+  if (this->backend == AzureBackend::Azurite) {
+    account_blob_url = "http://127.0.0.1:10000/"; + account_name + "/";
+    account_dfs_url = "http://127.0.0.1:10000/"; + account_name + "/";
+  } else {
+    account_dfs_url = "https://"; + account_name + ".dfs.core.windows.net/";
+    account_blob_url = "https://"; + account_name + ".blob.core.windows.net/";
+  }
+  storage_credentials_provider =
+      
std::make_shared<Azure::Storage::StorageSharedKeyCredential>(account_name,
+                                                                   
account_key);
+  credentials_kind = AzureCredentialsKind::StorageCredentials;
+  return Status::OK();
+}
+namespace {
+
+// An AzureFileSystem represents a single Azure storage account. AzurePath 
describes a
+// container and path within that storage account.
+struct AzurePath {
+  std::string full_path;
+  std::string container;
+  std::string path_to_file;
+  std::vector<std::string> path_to_file_parts;
+
+  static Result<AzurePath> FromString(const std::string& s) {
+    // Example expected string format: testcontainer/testdir/testfile.txt
+    // container = testcontainer
+    // path_to_file = testdir/testfile.txt
+    // path_to_file_parts = [testdir, testfile.txt]
+    if (internal::IsLikelyUri(s)) {
+      return Status::Invalid(
+          "Expected an Azure object path of the form 'container/path...', got 
a URI: '",
+          s, "'");
+    }
+    auto src = internal::RemoveTrailingSlash(s);
+    auto input_path = std::string(src.data());
+    src = internal::RemoveLeadingSlash(src);
+    auto first_sep = src.find_first_of(internal::kSep);
+    if (first_sep == 0) {
+      return Status::Invalid("Path cannot start with a separator ('", 
input_path, "')");
+    }
+    if (first_sep == std::string::npos) {
+      return AzurePath{std::string(src), std::string(src), "", {}};
+    }
+    AzurePath path;
+    path.full_path = std::string(src);
+    path.container = std::string(src.substr(0, first_sep));
+    path.path_to_file = std::string(src.substr(first_sep + 1));
+    path.path_to_file_parts = internal::SplitAbstractPath(path.path_to_file);
+    RETURN_NOT_OK(Validate(path));
+    return path;
+  }
+
+  static Status Validate(const AzurePath& path) {
+    auto status = internal::ValidateAbstractPathParts(path.path_to_file_parts);
+    if (!status.ok()) {
+      return Status::Invalid(status.message(), " in path ", path.full_path);
+    } else {
+      return status;
+    }
+  }
+
+  AzurePath parent() const {
+    DCHECK(has_parent());
+    auto parent = AzurePath{"", container, "", path_to_file_parts};
+    parent.path_to_file_parts.pop_back();
+    parent.path_to_file = 
internal::JoinAbstractPath(parent.path_to_file_parts);
+    if (parent.path_to_file.empty()) {
+      parent.full_path = parent.container;
+    } else {
+      parent.full_path = parent.container + internal::kSep + 
parent.path_to_file;
+    }
+    return parent;
+  }
+
+  bool has_parent() const { return !path_to_file.empty(); }
+
+  bool empty() const { return container.empty() && path_to_file.empty(); }
+
+  bool operator==(const AzurePath& other) const {
+    return container == other.container && path_to_file == other.path_to_file;
+  }
+};
+
+Status PathNotFound(const AzurePath& path) {
+  return ::arrow::fs::internal::PathNotFound(path.full_path);
+}
+
+Status NotAFile(const AzurePath& path) {
+  return ::arrow::fs::internal::NotAFile(path.full_path);
+}
+
+Status ValidateFilePath(const AzurePath& path) {
+  if (path.container.empty()) {
+    return PathNotFound(path);
+  }
+
+  if (path.path_to_file.empty()) {
+    return NotAFile(path);
+  }
+  return Status::OK();
+}
+
+Status ErrorToStatus(const std::string& prefix,
+                     const Azure::Storage::StorageException& exception) {
+  return Status::IOError(prefix, " Azure Error: ", exception.what());
+}
+
+template <typename ObjectResult>
+std::shared_ptr<const KeyValueMetadata> GetObjectMetadata(const ObjectResult& 
result) {
+  auto md = std::make_shared<KeyValueMetadata>();
+  for (auto prop : result) {
+    md->Append(prop.first, prop.second);
+  }
+  return md;
+}
+
+class ObjectInputFile final : public io::RandomAccessFile {
+ public:
+  ObjectInputFile(std::shared_ptr<Azure::Storage::Blobs::BlobClient>& 
blob_client,
+                  const io::IOContext& io_context, const AzurePath& path,
+                  int64_t size = kNoSize)
+      : blob_client_(std::move(blob_client)),
+        io_context_(io_context),
+        path_(path),
+        content_length_(size) {}
+
+  Status Init() {
+    if (content_length_ != kNoSize) {
+      DCHECK_GE(content_length_, 0);
+      return Status::OK();
+    }
+    try {
+      auto properties = blob_client_->GetProperties();
+      content_length_ = properties.Value.BlobSize;
+      metadata_ = GetObjectMetadata(properties.Value.Metadata);
+      return Status::OK();
+    } catch (const Azure::Storage::StorageException& exception) {
+      if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) 
{
+        // Could be either container or blob not found.
+        return PathNotFound(path_);
+      }
+      return ErrorToStatus(
+          "When fetching properties for '" + blob_client_->GetUrl() + "': ", 
exception);
+    }
+  }
+
+  Status CheckClosed() const {
+    if (closed_) {
+      return Status::Invalid("Operation on closed stream");
+    }
+    return Status::OK();
+  }

Review Comment:
   If you change the signature to `Status CheckClosed(const char* operation)`, 
you can pass the name of the operation initiating this check and improve the 
error message:
   
   `      return Status::Invalid("Operation on closed stream: ", operation);`



##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -37,34 +43,331 @@ bool AzureOptions::Equals(const AzureOptions& other) const 
{
           credentials_kind == other.credentials_kind);
 }
 
+Status AzureOptions::ConfigureAccountKeyCredentials(const std::string& 
account_name,
+                                                    const std::string& 
account_key) {
+  if (this->backend == AzureBackend::Azurite) {
+    account_blob_url = "http://127.0.0.1:10000/"; + account_name + "/";
+    account_dfs_url = "http://127.0.0.1:10000/"; + account_name + "/";
+  } else {
+    account_dfs_url = "https://"; + account_name + ".dfs.core.windows.net/";
+    account_blob_url = "https://"; + account_name + ".blob.core.windows.net/";
+  }
+  storage_credentials_provider =
+      
std::make_shared<Azure::Storage::StorageSharedKeyCredential>(account_name,
+                                                                   
account_key);
+  credentials_kind = AzureCredentialsKind::StorageCredentials;
+  return Status::OK();
+}
+namespace {
+
+// An AzureFileSystem represents a single Azure storage account. AzurePath 
describes a
+// container and path within that storage account.
+struct AzurePath {
+  std::string full_path;
+  std::string container;
+  std::string path_to_file;
+  std::vector<std::string> path_to_file_parts;
+
+  static Result<AzurePath> FromString(const std::string& s) {
+    // Example expected string format: testcontainer/testdir/testfile.txt
+    // container = testcontainer
+    // path_to_file = testdir/testfile.txt
+    // path_to_file_parts = [testdir, testfile.txt]
+    if (internal::IsLikelyUri(s)) {
+      return Status::Invalid(
+          "Expected an Azure object path of the form 'container/path...', got 
a URI: '",
+          s, "'");
+    }
+    auto src = internal::RemoveTrailingSlash(s);
+    auto input_path = std::string(src.data());
+    src = internal::RemoveLeadingSlash(src);
+    auto first_sep = src.find_first_of(internal::kSep);
+    if (first_sep == 0) {
+      return Status::Invalid("Path cannot start with a separator ('", 
input_path, "')");
+    }
+    if (first_sep == std::string::npos) {
+      return AzurePath{std::string(src), std::string(src), "", {}};
+    }
+    AzurePath path;
+    path.full_path = std::string(src);
+    path.container = std::string(src.substr(0, first_sep));
+    path.path_to_file = std::string(src.substr(first_sep + 1));
+    path.path_to_file_parts = internal::SplitAbstractPath(path.path_to_file);
+    RETURN_NOT_OK(Validate(path));
+    return path;
+  }
+
+  static Status Validate(const AzurePath& path) {
+    auto status = internal::ValidateAbstractPathParts(path.path_to_file_parts);
+    if (!status.ok()) {
+      return Status::Invalid(status.message(), " in path ", path.full_path);
+    } else {
+      return status;
+    }
+  }
+
+  AzurePath parent() const {
+    DCHECK(has_parent());
+    auto parent = AzurePath{"", container, "", path_to_file_parts};
+    parent.path_to_file_parts.pop_back();
+    parent.path_to_file = 
internal::JoinAbstractPath(parent.path_to_file_parts);
+    if (parent.path_to_file.empty()) {
+      parent.full_path = parent.container;
+    } else {
+      parent.full_path = parent.container + internal::kSep + 
parent.path_to_file;
+    }
+    return parent;
+  }
+
+  bool has_parent() const { return !path_to_file.empty(); }
+
+  bool empty() const { return container.empty() && path_to_file.empty(); }
+
+  bool operator==(const AzurePath& other) const {
+    return container == other.container && path_to_file == other.path_to_file;
+  }
+};
+
+Status PathNotFound(const AzurePath& path) {
+  return ::arrow::fs::internal::PathNotFound(path.full_path);
+}
+
+Status NotAFile(const AzurePath& path) {
+  return ::arrow::fs::internal::NotAFile(path.full_path);
+}
+
+Status ValidateFilePath(const AzurePath& path) {
+  if (path.container.empty()) {
+    return PathNotFound(path);
+  }
+
+  if (path.path_to_file.empty()) {
+    return NotAFile(path);
+  }
+  return Status::OK();
+}
+
+Status ErrorToStatus(const std::string& prefix,
+                     const Azure::Storage::StorageException& exception) {
+  return Status::IOError(prefix, " Azure Error: ", exception.what());
+}
+
+template <typename ObjectResult>
+std::shared_ptr<const KeyValueMetadata> GetObjectMetadata(const ObjectResult& 
result) {
+  auto md = std::make_shared<KeyValueMetadata>();
+  for (auto prop : result) {
+    md->Append(prop.first, prop.second);
+  }
+  return md;
+}
+
+class ObjectInputFile final : public io::RandomAccessFile {
+ public:
+  ObjectInputFile(std::shared_ptr<Azure::Storage::Blobs::BlobClient>& 
blob_client,
+                  const io::IOContext& io_context, const AzurePath& path,
+                  int64_t size = kNoSize)
+      : blob_client_(std::move(blob_client)),
+        io_context_(io_context),
+        path_(path),
+        content_length_(size) {}
+
+  Status Init() {
+    if (content_length_ != kNoSize) {
+      DCHECK_GE(content_length_, 0);
+      return Status::OK();
+    }
+    try {
+      auto properties = blob_client_->GetProperties();
+      content_length_ = properties.Value.BlobSize;
+      metadata_ = GetObjectMetadata(properties.Value.Metadata);
+      return Status::OK();
+    } catch (const Azure::Storage::StorageException& exception) {
+      if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) 
{
+        // Could be either container or blob not found.
+        return PathNotFound(path_);
+      }
+      return ErrorToStatus(
+          "When fetching properties for '" + blob_client_->GetUrl() + "': ", 
exception);
+    }
+  }
+
+  Status CheckClosed() const {
+    if (closed_) {
+      return Status::Invalid("Operation on closed stream");
+    }
+    return Status::OK();
+  }
+
+  Status CheckPosition(int64_t position, const char* action) const {
+    if (position < 0) {
+      return Status::Invalid("Cannot ", action, " from negative position");
+    }
+    if (position > content_length_) {
+      return Status::IOError("Cannot ", action, " past end of file");

Review Comment:
   `DCHECK` that `content_length_` is not `kNoSize` still.



##########
cpp/src/arrow/filesystem/azurefs_test.cc:
##########
@@ -154,6 +137,332 @@ TEST(AzureFileSystem, OptionsCompare) {
   EXPECT_TRUE(options.Equals(options));
 }
 
+class TestAzureFileSystem : public ::testing::Test {
+ public:
+  std::shared_ptr<FileSystem> fs_;
+  std::shared_ptr<Azure::Storage::Blobs::BlobServiceClient> service_client_;
+  AzureOptions options_;
+  std::mt19937_64 generator_;
+  std::string container_name_;
+
+  void MakeFileSystem() {

Review Comment:
   I suggest calling this `MakeOptions` and having it return an `AzureOptions` 
object.
   
   In test code we don't have to care as much about re-use of the options 
object that does contain heap-allocated fields (every `std::string`) unless 
they are big.



##########
cpp/src/arrow/filesystem/azurefs_test.cc:
##########
@@ -154,6 +137,332 @@ TEST(AzureFileSystem, OptionsCompare) {
   EXPECT_TRUE(options.Equals(options));
 }
 
+class TestAzureFileSystem : public ::testing::Test {
+ public:
+  std::shared_ptr<FileSystem> fs_;
+  std::shared_ptr<Azure::Storage::Blobs::BlobServiceClient> service_client_;
+  AzureOptions options_;
+  std::mt19937_64 generator_;
+  std::string container_name_;
+
+  void MakeFileSystem() {
+    const std::string& account_name = GetAzuriteEnv()->account_name();
+    const std::string& account_key = GetAzuriteEnv()->account_key();
+    options_.backend = AzureBackend::Azurite;
+    ASSERT_OK(options_.ConfigureAccountKeyCredentials(account_name, 
account_key));
+  }
+
+  void SetUp() override {
+    ASSERT_THAT(GetAzuriteEnv(), NotNull());
+    ASSERT_OK(GetAzuriteEnv()->status());
+
+    MakeFileSystem();
+    generator_ = std::mt19937_64(std::random_device()());
+    container_name_ = RandomChars(32);
+    service_client_ = 
std::make_shared<Azure::Storage::Blobs::BlobServiceClient>(
+        options_.account_blob_url, options_.storage_credentials_provider);
+    ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options_));
+    auto container_client = 
service_client_->GetBlobContainerClient(container_name_);
+    container_client.CreateIfNotExists();
+
+    auto blob_client = 
container_client.GetBlockBlobClient(PreexistingObjectName());
+    blob_client.UploadFrom(reinterpret_cast<const uint8_t*>(kLoremIpsum),
+                           strlen(kLoremIpsum));
+  }
+
+  void TearDown() override {
+    auto containers = service_client_->ListBlobContainers();
+    for (auto container : containers.BlobContainers) {
+      auto container_client = 
service_client_->GetBlobContainerClient(container.Name);
+      container_client.DeleteIfExists();
+    }
+  }
+
+  std::string PreexistingContainerName() const { return container_name_; }
+
+  std::string PreexistingContainerPath() const {
+    return PreexistingContainerName() + '/';
+  }
+
+  static std::string PreexistingObjectName() { return "test-object-name"; }
+
+  std::string PreexistingObjectPath() const {
+    return PreexistingContainerPath() + PreexistingObjectName();
+  }
+
+  std::string NotFoundObjectPath() { return PreexistingContainerPath() + 
"not-found"; }
+
+  std::string RandomLine(int lineno, std::size_t width) {
+    auto line = std::to_string(lineno) + ":    ";
+    line += RandomChars(width - line.size() - 1);
+    line += '\n';
+    return line;
+  }
+
+  uint8_t RandomInteger() {
+    return std::uniform_int_distribution<std::uint8_t>()(generator_);
+  }
+
+  std::size_t RandomIndex(std::size_t end) {
+    return std::uniform_int_distribution<std::size_t>(0, end - 1)(generator_);
+  }
+
+  std::string RandomChars(std::size_t count) {
+    auto const fillers = std::string("abcdefghijlkmnopqrstuvwxyz0123456789");
+    std::uniform_int_distribution<std::size_t> d(0, fillers.size() - 1);
+    std::string s;
+    std::generate_n(std::back_inserter(s), count, [&] { return 
fillers[d(generator_)]; });
+    return s;
+  }
+
+  void UploadLines(std::vector<std::string> lines, const char* path_to_file,

Review Comment:
   Pass `const std::vector<std::string> &lines` to avoid a copy.



##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -37,34 +43,331 @@ bool AzureOptions::Equals(const AzureOptions& other) const 
{
           credentials_kind == other.credentials_kind);
 }
 
+Status AzureOptions::ConfigureAccountKeyCredentials(const std::string& 
account_name,
+                                                    const std::string& 
account_key) {
+  if (this->backend == AzureBackend::Azurite) {
+    account_blob_url = "http://127.0.0.1:10000/"; + account_name + "/";
+    account_dfs_url = "http://127.0.0.1:10000/"; + account_name + "/";
+  } else {
+    account_dfs_url = "https://"; + account_name + ".dfs.core.windows.net/";
+    account_blob_url = "https://"; + account_name + ".blob.core.windows.net/";
+  }
+  storage_credentials_provider =
+      
std::make_shared<Azure::Storage::StorageSharedKeyCredential>(account_name,
+                                                                   
account_key);
+  credentials_kind = AzureCredentialsKind::StorageCredentials;
+  return Status::OK();
+}
+namespace {
+
+// An AzureFileSystem represents a single Azure storage account. AzurePath 
describes a
+// container and path within that storage account.
+struct AzurePath {
+  std::string full_path;
+  std::string container;
+  std::string path_to_file;
+  std::vector<std::string> path_to_file_parts;

Review Comment:
   The effort needing in keeping this consistent with the others *might* mean 
it's better to not have it as a cached field.



##########
cpp/src/arrow/filesystem/azurefs.cc:
##########
@@ -37,34 +43,331 @@ bool AzureOptions::Equals(const AzureOptions& other) const 
{
           credentials_kind == other.credentials_kind);
 }
 
+Status AzureOptions::ConfigureAccountKeyCredentials(const std::string& 
account_name,
+                                                    const std::string& 
account_key) {
+  if (this->backend == AzureBackend::Azurite) {
+    account_blob_url = "http://127.0.0.1:10000/"; + account_name + "/";
+    account_dfs_url = "http://127.0.0.1:10000/"; + account_name + "/";
+  } else {
+    account_dfs_url = "https://"; + account_name + ".dfs.core.windows.net/";
+    account_blob_url = "https://"; + account_name + ".blob.core.windows.net/";
+  }
+  storage_credentials_provider =
+      
std::make_shared<Azure::Storage::StorageSharedKeyCredential>(account_name,
+                                                                   
account_key);
+  credentials_kind = AzureCredentialsKind::StorageCredentials;
+  return Status::OK();
+}
+namespace {
+
+// An AzureFileSystem represents a single Azure storage account. AzurePath 
describes a
+// container and path within that storage account.
+struct AzurePath {
+  std::string full_path;
+  std::string container;
+  std::string path_to_file;
+  std::vector<std::string> path_to_file_parts;
+
+  static Result<AzurePath> FromString(const std::string& s) {
+    // Example expected string format: testcontainer/testdir/testfile.txt
+    // container = testcontainer
+    // path_to_file = testdir/testfile.txt
+    // path_to_file_parts = [testdir, testfile.txt]
+    if (internal::IsLikelyUri(s)) {
+      return Status::Invalid(
+          "Expected an Azure object path of the form 'container/path...', got 
a URI: '",
+          s, "'");
+    }
+    auto src = internal::RemoveTrailingSlash(s);
+    auto input_path = std::string(src.data());
+    src = internal::RemoveLeadingSlash(src);
+    auto first_sep = src.find_first_of(internal::kSep);
+    if (first_sep == 0) {
+      return Status::Invalid("Path cannot start with a separator ('", 
input_path, "')");
+    }
+    if (first_sep == std::string::npos) {
+      return AzurePath{std::string(src), std::string(src), "", {}};
+    }
+    AzurePath path;
+    path.full_path = std::string(src);
+    path.container = std::string(src.substr(0, first_sep));
+    path.path_to_file = std::string(src.substr(first_sep + 1));
+    path.path_to_file_parts = internal::SplitAbstractPath(path.path_to_file);
+    RETURN_NOT_OK(Validate(path));
+    return path;
+  }
+
+  static Status Validate(const AzurePath& path) {
+    auto status = internal::ValidateAbstractPathParts(path.path_to_file_parts);
+    if (!status.ok()) {
+      return Status::Invalid(status.message(), " in path ", path.full_path);
+    } else {
+      return status;
+    }
+  }
+
+  AzurePath parent() const {
+    DCHECK(has_parent());
+    auto parent = AzurePath{"", container, "", path_to_file_parts};
+    parent.path_to_file_parts.pop_back();
+    parent.path_to_file = 
internal::JoinAbstractPath(parent.path_to_file_parts);
+    if (parent.path_to_file.empty()) {
+      parent.full_path = parent.container;
+    } else {
+      parent.full_path = parent.container + internal::kSep + 
parent.path_to_file;
+    }
+    return parent;
+  }
+
+  bool has_parent() const { return !path_to_file.empty(); }
+
+  bool empty() const { return container.empty() && path_to_file.empty(); }
+
+  bool operator==(const AzurePath& other) const {
+    return container == other.container && path_to_file == other.path_to_file;
+  }
+};
+
+Status PathNotFound(const AzurePath& path) {
+  return ::arrow::fs::internal::PathNotFound(path.full_path);
+}
+
+Status NotAFile(const AzurePath& path) {
+  return ::arrow::fs::internal::NotAFile(path.full_path);
+}
+
+Status ValidateFilePath(const AzurePath& path) {
+  if (path.container.empty()) {
+    return PathNotFound(path);
+  }
+
+  if (path.path_to_file.empty()) {
+    return NotAFile(path);
+  }
+  return Status::OK();
+}
+
+Status ErrorToStatus(const std::string& prefix,
+                     const Azure::Storage::StorageException& exception) {
+  return Status::IOError(prefix, " Azure Error: ", exception.what());
+}
+
+template <typename ObjectResult>
+std::shared_ptr<const KeyValueMetadata> GetObjectMetadata(const ObjectResult& 
result) {
+  auto md = std::make_shared<KeyValueMetadata>();
+  for (auto prop : result) {
+    md->Append(prop.first, prop.second);
+  }
+  return md;
+}
+
+class ObjectInputFile final : public io::RandomAccessFile {
+ public:
+  ObjectInputFile(std::shared_ptr<Azure::Storage::Blobs::BlobClient>& 
blob_client,
+                  const io::IOContext& io_context, const AzurePath& path,
+                  int64_t size = kNoSize)
+      : blob_client_(std::move(blob_client)),
+        io_context_(io_context),
+        path_(path),

Review Comment:
   You can change parameter from `const ..&` to `AzurePath path` and 
`path(std::move(path))` here to support callers of the constructors to call it 
both by copying the path or by `std::move`-ing the path. 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to