Re: [PR] refactor(theta): introduce ThetaSketchView trait and enhance ThetaHashTable used for set operation (datasketches-rust)

via GitHub Fri, 20 Feb 2026 04:42:41 -0800


Copilot commented on code in PR #100:
URL: https://github.com/apache/datasketches-rust/pull/100#discussion_r2833017231



##########
datasketches/src/theta/sketch.rs:
##########
@@ -224,13 +252,7 @@ impl ThetaSketch {
             entries.sort_unstable();
         }
 
-        CompactThetaSketch {
-            entries,
-            theta,
-            seed_hash: self.table.seed_hash(),
-            ordered,
-            empty,
-        }
+        CompactThetaSketch::from_parts(entries, theta, self.table.seed_hash(), 
ordered, empty)

Review Comment:
   The refactoring to use `from_parts` is good, but note that the `empty` 
parameter passed to `from_parts` is computed from `entries.is_empty()` (line 
239), which checks if there are retained hashes. However, with the new 
`is_empty` semantics in ThetaHashTable that tracks logical emptiness (whether 
any updates were attempted), there's now a potential inconsistency: if a 
ThetaSketch has been updated but all values were screened out, 
`self.table.is_empty()` would return false, but the CompactThetaSketch created 
here would have `empty = true`. This may be intentional for CompactThetaSketch, 
but it creates a semantic difference between `sketch.is_empty()` and 
`sketch.compact(false).is_empty()`.



##########
datasketches/src/theta/hash_table.rs:
##########
@@ -344,46 +385,43 @@ mod tests {
             starting_sub_multiple(8 + 1, MIN_LG_K, ResizeFactor::X8.lg_value())
         );
         assert_eq!(table.theta, starting_theta_from_sampling_probability(1.0));
-        assert_eq!(table.num_entries(), 0);
+        assert_eq!(table.num_retained(), 0);
         assert!(table.is_empty());
         assert_eq!(table.iter().count(), 0);
     }
 
     #[test]
-    fn test_hash_and_screen() {
+    fn test_hash_and_theta_screen_behavior() {
         let mut table = ThetaHashTable::new(8, ResizeFactor::X8, 1.0, 
DEFAULT_UPDATE_SEED);
 
-        // With MAX_THETA, all hashes should pass
-        let hash1 = table.hash_and_screen("test1");
-        let hash2 = table.hash_and_screen("test2");
+        // With MAX_THETA, hashes are computed normally.
+        let hash1 = table.hash("test1");
+        let hash2 = table.hash("test2");
         assert_ne!(hash1, 0);
         assert_ne!(hash2, 0);
         assert_ne!(hash1, hash2);
 
-        // With low theta, some hashes should be filtered
+        // With low theta, update should be screened out.
         table.theta = 1;
-        let hash3 = table.hash_and_screen("test3");
-        assert_eq!(hash3, 0);
+        assert!(!table.try_insert("test3"));
     }
 
     #[test]
     fn test_try_insert() {
         let mut table = ThetaHashTable::new(5, ResizeFactor::X8, 1.0, 
DEFAULT_UPDATE_SEED);
 
-        // Insert a hash value
-        let hash = table.hash_and_screen("test_value");
-        assert_ne!(hash, 0);
-        assert!(table.try_insert(hash));
-        assert_eq!(table.num_entries(), 1);
+        assert!(table.try_insert("test_value"));
+        assert_eq!(table.num_retained(), 1);
         assert!(!table.is_empty());
 
-        // Try to insert the same hash again (should fail)
-        assert!(!table.try_insert(hash));
-        assert_eq!(table.num_entries(), 1);
+        // Try to insert the same value again (should fail)
+        assert!(!table.try_insert("test_value"));
+        assert_eq!(table.num_retained(), 1);
 
-        // Try to insert 0 (should fail)
-        assert!(!table.try_insert(0));
-        assert_eq!(table.num_entries(), 1);
+        // Force screening and verify insertion fails
+        table.theta = 0;
+        assert!(!table.try_insert("screened"));
+        assert_eq!(table.num_retained(), 1);

Review Comment:
   After setting theta to 0, the table should still be considered non-empty 
because "screened" was attempted to be inserted (even though it was screened 
out). However, this test doesn't verify that is_empty remains false after the 
failed insertion. Consider adding an assertion like 
`assert!(!table.is_empty());` after line 424 to verify the semantic distinction 
between logical emptiness and having zero retained entries.
   ```suggestion
           assert_eq!(table.num_retained(), 1);
           assert!(!table.is_empty());
   ```



##########
datasketches/src/theta/hash_table.rs:
##########
@@ -61,10 +61,16 @@ pub(crate) struct ThetaHashTable {
     sampling_probability: f32,
     hash_seed: u64,
 
+    // Logical emptiness of the source set. This can be false even when 
`num_retained` is 0 (e.g.
+    // all updates screened by theta).

Review Comment:
   The comment for the `is_empty` field could be more precise. Consider 
rephrasing to: "Logical emptiness of the source set. False if any update has 
been attempted (even if screened by theta), true if no updates have been 
attempted. This can be false even when `num_retained` is 0." This makes it 
clearer that it tracks update attempts, not just screening.
   ```suggestion
       // Logical emptiness of the source set. False if any update has been 
attempted (even if
       // screened by theta), true if no updates have been attempted. This can 
be false even when
       // `num_retained` is 0.
   ```



##########
datasketches/src/theta/hash_table.rs:
##########
@@ -77,34 +83,58 @@ impl ThetaHashTable {
     ) -> Self {
         let lg_max_size = lg_nom_size + 1;
         let lg_cur_size = starting_sub_multiple(lg_max_size, MIN_LG_K, 
resize_factor.lg_value());
+        Self::new_with_state(
+            lg_cur_size,
+            lg_nom_size,
+            resize_factor,
+            sampling_probability,
+            starting_theta_from_sampling_probability(sampling_probability),
+            hash_seed,
+            true,
+        )
+    }
+
+    /// Create a table with explicit state.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `lg_cur_size > lg_nom_size + 1`. (`lg_nom_size + 1 == 
lg_max_size`)
+    pub fn new_with_state(
+        lg_cur_size: u8,
+        lg_nom_size: u8,
+        resize_factor: ResizeFactor,
+        sampling_probability: f32,
+        theta: u64,
+        hash_seed: u64,
+        is_empty: bool,
+    ) -> Self {
+        let lg_max_size = lg_nom_size + 1;
+        assert!(
+            lg_cur_size <= lg_max_size,
+            "lg_cur_size must be <= lg_nom_size + 1, got 
lg_cur_size={lg_cur_size}, lg_nom_size={lg_nom_size}"
+        );
         let size = if lg_cur_size > 0 { 1 << lg_cur_size } else { 0 };
         let entries = vec![0u64; size];
-
         Self {
             lg_cur_size,
             lg_nom_size,
             lg_max_size,
             resize_factor,
             sampling_probability,
-            theta: 
starting_theta_from_sampling_probability(sampling_probability),
             hash_seed,
+            is_empty,
+            theta,
             entries,
-            num_entries: 0,
+            num_retained: 0,
         }
     }
 
-    /// Hash and screen a value
-    ///
-    /// Returns the hash value if it passes the theta threshold, otherwise 0.
-    pub fn hash_and_screen<T: Hash>(&mut self, value: T) -> u64 {
+    /// Hash a value with the table seed and return the hash.

Review Comment:
   The documentation for the `hash` method should clarify that it only computes 
the hash value without applying theta screening. This is an important 
distinction from the previous `hash_and_screen` method. Consider adding a note 
like: "Note: This method only computes the hash and does not apply theta 
screening. Use `try_insert` or `try_insert_hash` for insertion with screening."
   ```suggestion
       /// Hash a value with the table seed and return the hash.
       ///
       /// Note: This method only computes the hash and does not apply theta 
screening.
       /// For insertion with theta screening, use [`try_insert`] or 
[`try_insert_hash`].
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] refactor(theta): introduce ThetaSketchView trait and enhance ThetaHashTable used for set operation (datasketches-rust)

Reply via email to