This is an automated email from the ASF dual-hosted git repository.
xushiyan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/hudi-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 0724b7f docs: add in-code docs to hudi-core APIs (#166)
0724b7f is described below
commit 0724b7fda377ca5166cced9a2e6493346513befe
Author: Shiyan Xu <[email protected]>
AuthorDate: Sat Oct 12 11:03:31 2024 -1000
docs: add in-code docs to hudi-core APIs (#166)
---
crates/core/src/config/internal.rs | 7 ++-----
crates/core/src/config/mod.rs | 4 ++++
crates/core/src/config/read.rs | 7 ++-----
crates/core/src/config/table.rs | 6 ++----
crates/core/src/config/utils.rs | 4 ++++
crates/core/src/file_group/mod.rs | 2 ++
crates/core/src/file_group/reader.rs | 1 +
crates/core/src/lib.rs | 11 +++--------
crates/core/src/storage/file_info.rs | 1 +
crates/core/src/storage/file_stats.rs | 1 +
crates/core/src/storage/utils.rs | 5 +++++
crates/core/src/table/fs_view.rs | 2 ++
crates/core/src/table/partition.rs | 7 +++++++
crates/core/src/table/timeline.rs | 3 +++
14 files changed, 39 insertions(+), 22 deletions(-)
diff --git a/crates/core/src/config/internal.rs
b/crates/core/src/config/internal.rs
index 42641f5..89fd314 100644
--- a/crates/core/src/config/internal.rs
+++ b/crates/core/src/config/internal.rs
@@ -31,14 +31,11 @@ use crate::config::{ConfigParser, HudiConfigValue};
/// **Example**
///
/// ```rust
-/// use url::Url;
-/// use hudi_core::config::HudiConfigValue;
/// use hudi_core::config::internal::HudiInternalConfig::SkipConfigValidation;
/// use hudi_core::table::Table as HudiTable;
///
-/// let options = vec![(SkipConfigValidation.as_ref(),
HudiConfigValue::Boolean(true))];
-/// let base_uri = Url::from_file_path("/tmp/hudi_data").unwrap();
-/// HudiTable::new_with_options(base_uri.as_ref(), options);
+/// let options = [(SkipConfigValidation, "true")];
+/// HudiTable::new_with_options("/tmp/hudi_data", options);
/// ```
///
#[derive(Clone, Debug, PartialEq, Eq, Hash, EnumIter)]
diff --git a/crates/core/src/config/mod.rs b/crates/core/src/config/mod.rs
index be6136f..7b557b4 100644
--- a/crates/core/src/config/mod.rs
+++ b/crates/core/src/config/mod.rs
@@ -97,6 +97,8 @@ impl HudiConfigValue {
T::from(self)
}
+ /// A convenience method to convert [HudiConfigValue] to [Url] when the
value is a [String] and is intended to be a URL.
+ /// Panics if the value is not a [String].
pub fn to_url(self) -> Result<Url> {
match self {
HudiConfigValue::String(v) => parse_uri(&v),
@@ -192,10 +194,12 @@ impl HudiConfigs {
self.raw_options.as_ref().clone()
}
+ /// Validate the associated config using the given parser by executing the
[ConfigParser::validate] method.
pub fn validate(&self, parser: impl ConfigParser<Output =
HudiConfigValue>) -> Result<()> {
parser.validate(&self.raw_options)
}
+ /// Check if the given key exists in the configs.
pub fn contains(&self, key: impl AsRef<str>) -> bool {
self.raw_options.contains_key(key.as_ref())
}
diff --git a/crates/core/src/config/read.rs b/crates/core/src/config/read.rs
index 807a0cf..e67617d 100644
--- a/crates/core/src/config/read.rs
+++ b/crates/core/src/config/read.rs
@@ -30,14 +30,11 @@ use strum_macros::EnumIter;
/// **Example**
///
/// ```rust
-/// use url::Url;
/// use hudi_core::config::read::HudiReadConfig::{AsOfTimestamp,
InputPartitions};
/// use hudi_core::table::Table as HudiTable;
///
-/// let options = vec![(InputPartitions.as_ref(), "2"),
-/// (AsOfTimestamp.as_ref(), "20240101010100000")];
-/// let base_uri = Url::from_file_path("/tmp/hudi_data").unwrap();
-/// HudiTable::new_with_options(base_uri.as_ref(), options);
+/// let options = [(InputPartitions, "2"), (AsOfTimestamp,
"20240101010100000")];
+/// HudiTable::new_with_options("/tmp/hudi_data", options);
/// ```
///
#[derive(Clone, Debug, PartialEq, Eq, Hash, EnumIter)]
diff --git a/crates/core/src/config/table.rs b/crates/core/src/config/table.rs
index 107e382..98cca7d 100644
--- a/crates/core/src/config/table.rs
+++ b/crates/core/src/config/table.rs
@@ -32,13 +32,11 @@ use crate::config::{ConfigParser, HudiConfigValue};
/// **Example**
///
/// ```rust
-/// use url::Url;
/// use hudi_core::config::table::HudiTableConfig::BaseFileFormat;
/// use hudi_core::table::Table as HudiTable;
///
-/// let options = vec![(BaseFileFormat.as_ref(), "parquet")];
-/// let base_uri = Url::from_file_path("/tmp/hudi_data").unwrap();
-/// HudiTable::new_with_options(base_uri.as_ref(), options);
+/// let options = [(BaseFileFormat, "parquet")];
+/// HudiTable::new_with_options("/tmp/hudi_data", options);
/// ```
#[derive(Clone, Debug, PartialEq, Eq, Hash, EnumIter)]
pub enum HudiTableConfig {
diff --git a/crates/core/src/config/utils.rs b/crates/core/src/config/utils.rs
index 98ea6b1..800a81d 100644
--- a/crates/core/src/config/utils.rs
+++ b/crates/core/src/config/utils.rs
@@ -16,16 +16,19 @@
* specific language governing permissions and limitations
* under the License.
*/
+//! Config utilities.
use anyhow::{Context, Result};
use bytes::Bytes;
use std::collections::HashMap;
use std::io::{BufRead, BufReader, Cursor};
+/// Returns an empty iterator to represent an empty set of options.
pub fn empty_options<'a>() -> std::iter::Empty<(&'a str, &'a str)> {
std::iter::empty::<(&str, &str)>()
}
+/// Splits the given options into two maps: one for Hudi options, and one
for all remaining options (for example, storage options).
pub fn split_hudi_options_from_others<I, K, V>(
all_options: I,
) -> (HashMap<String, String>, HashMap<String, String>)
@@ -47,6 +50,7 @@ where
(hudi_options, others)
}
+/// Parses the given data into a map of options.
pub fn parse_data_for_options(data: &Bytes, split_chars: &str) ->
Result<HashMap<String, String>> {
let cursor = Cursor::new(data);
let lines = BufReader::new(cursor).lines();
diff --git a/crates/core/src/file_group/mod.rs
b/crates/core/src/file_group/mod.rs
index 6cd1248..db96b24 100644
--- a/crates/core/src/file_group/mod.rs
+++ b/crates/core/src/file_group/mod.rs
@@ -114,6 +114,7 @@ impl FileSlice {
self.base_file = base_file
}
+ /// Load stats from storage layer for the base file if not already loaded.
pub async fn load_stats(&mut self, storage: &Storage) -> Result<()> {
if self.base_file.stats.is_none() {
let parquet_meta = storage
@@ -135,6 +136,7 @@ impl FileSlice {
}
}
+/// Hudi File Group.
#[derive(Clone, Debug)]
pub struct FileGroup {
pub id: String,
diff --git a/crates/core/src/file_group/reader.rs
b/crates/core/src/file_group/reader.rs
index 0c97dd5..1c7d748 100644
--- a/crates/core/src/file_group/reader.rs
+++ b/crates/core/src/file_group/reader.rs
@@ -25,6 +25,7 @@ use anyhow::Result;
use arrow_array::RecordBatch;
use std::sync::Arc;
+/// File group reader handles all read operations against a file group.
#[derive(Clone, Debug)]
pub struct FileGroupReader {
storage: Arc<Storage>,
diff --git a/crates/core/src/lib.rs b/crates/core/src/lib.rs
index 3369101..3e15dee 100644
--- a/crates/core/src/lib.rs
+++ b/crates/core/src/lib.rs
@@ -23,14 +23,11 @@
//! **Example**
//!
//! ```rust
-//! use url::Url;
//! use hudi_core::config::read::HudiReadConfig::{AsOfTimestamp,
InputPartitions};
//! use hudi_core::table::Table as HudiTable;
//!
-//! let options = vec![(InputPartitions.as_ref(), "2"),
-//! (AsOfTimestamp.as_ref(), "20240101010100000")];
-//! let base_uri = Url::from_file_path("/tmp/hudi_data").unwrap();
-//! HudiTable::new_with_options(base_uri.as_ref(), options);
+//! let options = [(InputPartitions, "2"), (AsOfTimestamp,
"20240101010100000")];
+//! HudiTable::new_with_options("/tmp/hudi_data", options);
//! ```
//!
//! # The [table] module is responsible for managing Hudi tables.
@@ -39,12 +36,10 @@
//!
//! create hudi table
//! ```rust
-//! use url::Url;
//! use hudi_core::table::Table;
//!
//! pub async fn test() {
-//! let base_uri = Url::from_file_path("/tmp/hudi_data").unwrap();
-//! let hudi_table = Table::new(base_uri.path()).await.unwrap();
+//! let hudi_table = Table::new("/tmp/hudi_data").await.unwrap();
//! }
//! ```
diff --git a/crates/core/src/storage/file_info.rs
b/crates/core/src/storage/file_info.rs
index 8a77048..a6f1e05 100644
--- a/crates/core/src/storage/file_info.rs
+++ b/crates/core/src/storage/file_info.rs
@@ -17,6 +17,7 @@
* under the License.
*/
+/// File info that can be retrieved by listing operations without reading the
file.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct FileInfo {
pub uri: String,
diff --git a/crates/core/src/storage/file_stats.rs
b/crates/core/src/storage/file_stats.rs
index b0d2bcb..65fe1c5 100644
--- a/crates/core/src/storage/file_stats.rs
+++ b/crates/core/src/storage/file_stats.rs
@@ -17,6 +17,7 @@
* under the License.
*/
+/// File stats that can be retrieved by reading the file's metadata.
#[derive(Clone, Debug, Default)]
pub struct FileStats {
pub num_records: i64,
diff --git a/crates/core/src/storage/utils.rs b/crates/core/src/storage/utils.rs
index ba670f2..2413613 100644
--- a/crates/core/src/storage/utils.rs
+++ b/crates/core/src/storage/utils.rs
@@ -16,12 +16,14 @@
* specific language governing permissions and limitations
* under the License.
*/
+//! Utility functions for storage.
use std::path::{Path, PathBuf};
use std::str::FromStr;
use anyhow::{anyhow, Result};
use url::{ParseError, Url};
+/// Splits a filename into a stem and an extension.
pub fn split_filename(filename: &str) -> Result<(String, String)> {
let path = Path::new(filename);
@@ -40,6 +42,7 @@ pub fn split_filename(filename: &str) -> Result<(String,
String)> {
Ok((stem, extension))
}
+/// Parses a URI string into a URL.
pub fn parse_uri(uri: &str) -> Result<Url> {
let mut url = Url::parse(uri)
.or(Url::from_file_path(PathBuf::from_str(uri)?))
@@ -54,10 +57,12 @@ pub fn parse_uri(uri: &str) -> Result<Url> {
Ok(url)
}
+/// Returns the scheme and authority of a URL in the form of
`scheme://authority`.
pub fn get_scheme_authority(url: &Url) -> String {
format!("{}://{}", url.scheme(), url.authority())
}
+/// Joins a base URL with a list of segments.
pub fn join_url_segments(base_url: &Url, segments: &[&str]) -> Result<Url> {
let mut url = base_url.clone();
diff --git a/crates/core/src/table/fs_view.rs b/crates/core/src/table/fs_view.rs
index 2278d6c..b67534a 100644
--- a/crates/core/src/table/fs_view.rs
+++ b/crates/core/src/table/fs_view.rs
@@ -29,6 +29,8 @@ use anyhow::Result;
use dashmap::DashMap;
use futures::stream::{self, StreamExt, TryStreamExt};
+/// A view of the Hudi table's data files (files stored outside the `.hoodie/`
directory) in the file system. It provides APIs to load and
+/// access the file groups and file slices.
#[derive(Clone, Debug)]
#[allow(dead_code)]
pub struct FileSystemView {
diff --git a/crates/core/src/table/partition.rs
b/crates/core/src/table/partition.rs
index 7dcf482..16b9a99 100644
--- a/crates/core/src/table/partition.rs
+++ b/crates/core/src/table/partition.rs
@@ -30,6 +30,7 @@ use std::collections::HashMap;
use std::str::FromStr;
use std::sync::Arc;
+/// A partition pruner that filters partitions based on the partition path and
its filters.
#[derive(Debug, Clone)]
pub struct PartitionPruner {
schema: Arc<Schema>,
@@ -64,6 +65,7 @@ impl PartitionPruner {
})
}
+ /// Creates an empty partition pruner that does not filter any partitions.
pub fn empty() -> Self {
PartitionPruner {
schema: Arc::new(Schema::empty()),
@@ -73,10 +75,12 @@ impl PartitionPruner {
}
}
+ /// Returns `true` if the partition pruner does not have any filters.
pub fn is_empty(&self) -> bool {
self.and_filters.is_empty()
}
+ /// Returns `true` if the partition path should be included based on the
filters.
pub fn should_include(&self, partition_path: &str) -> bool {
let segments = match self.parse_segments(partition_path) {
Ok(s) => s,
@@ -151,6 +155,7 @@ impl PartitionPruner {
}
}
+/// An operator that represents a comparison operation used in a partition
filter expression.
#[derive(Debug, Clone, Copy, PartialEq)]
enum Operator {
Eq,
@@ -171,6 +176,7 @@ impl Operator {
(">=", Operator::Gte),
];
+ /// Returns the supported operator tokens. Note that the tokens are sorted
by length in descending order to facilitate parsing.
fn supported_tokens() -> &'static [&'static str] {
static TOKENS: Lazy<Vec<&'static str>> = Lazy::new(|| {
let mut tokens: Vec<&'static str> = Operator::TOKEN_OP_PAIRS
@@ -195,6 +201,7 @@ impl FromStr for Operator {
}
}
+/// A partition filter that represents a filter expression for partition
pruning.
#[derive(Debug, Clone)]
pub struct PartitionFilter {
field: Field,
diff --git a/crates/core/src/table/timeline.rs
b/crates/core/src/table/timeline.rs
index 6dc9df5..2993c95 100644
--- a/crates/core/src/table/timeline.rs
+++ b/crates/core/src/table/timeline.rs
@@ -33,6 +33,7 @@ use crate::file_group::FileGroup;
use crate::storage::utils::split_filename;
use crate::storage::Storage;
+/// The [State] of an [Instant] represents the status of the action performed
on the table.
#[allow(dead_code)]
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum State {
@@ -41,6 +42,7 @@ pub enum State {
Completed,
}
+/// An [Instant] represents a point in time when an action was performed on
the table.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Instant {
state: State,
@@ -87,6 +89,7 @@ impl Instant {
}
}
+/// A [Timeline] contains transaction logs of all actions performed on the
table at different [Instant]s of time.
#[derive(Clone, Debug)]
#[allow(dead_code)]
pub struct Timeline {