Copilot commented on code in PR #2305:
URL:
https://github.com/apache/incubator-pegasus/pull/2305#discussion_r2436532417
##########
python-client/pypegasus/base/ttypes.py:
##########
@@ -44,10 +44,19 @@ def write(self, oprot):
def validate(self):
return
+
+ def raw(self):
+ if self._is_str:
+ return self.data.decode('UTF-8')
Review Comment:
The raw() method will fail if the original string data contained non-UTF-8
characters. Since the constructor uses 'latin-1' encoding in some contexts but
this method assumes UTF-8 for decoding, it could raise a UnicodeDecodeError.
```suggestion
return self.data.decode('UTF-8', errors='replace')
```
##########
python-client/pypegasus/pgclient.py:
##########
@@ -611,24 +620,43 @@ def generate_key(cls, hash_key, sort_key):
@classmethod
def generate_next_bytes(cls, buff):
- pos = len(buff) - 1
+ is_str = isinstance(buff, str)
+ is_ba = isinstance(buff, bytearray)
+
+ if is_str:
+ arr = bytearray(buff.encode('latin-1'))
Review Comment:
Using 'latin-1' encoding assumes the string contains only characters in the
0-255 range. Consider adding validation or using a more explicit encoding that
matches the expected data format.
##########
python-client/pypegasus/pgclient.py:
##########
@@ -611,24 +620,43 @@ def generate_key(cls, hash_key, sort_key):
@classmethod
def generate_next_bytes(cls, buff):
- pos = len(buff) - 1
+ is_str = isinstance(buff, str)
+ is_ba = isinstance(buff, bytearray)
+
+ if is_str:
+ arr = bytearray(buff.encode('latin-1'))
+ elif is_ba:
+ arr = buff
+ else:
+ arr = bytearray(buff)
+ pos = len(arr) - 1
found = False
while pos >= 0:
- if ord(buff[pos]) != 0xFF:
- buff[pos] += 1
+ if arr[pos] != 0xFF:
+ arr[pos] += 1
found = True
break
- if found:
- return buff
+ pos -= 1
+ if not found:
+ arr += b'\x00'
+ if is_str:
+ return arr.decode('latin-1')
Review Comment:
The decode operation uses 'latin-1' which should be consistent with the
encode operation, but this creates a dependency on the encoding choice that may
not be obvious to future maintainers. Consider adding a comment explaining the
encoding choice.
##########
python-client/pypegasus/pgclient.py:
##########
@@ -1004,6 +1032,20 @@ def get_scanner(self, hash_key,
stop_key, stop_inclusive = self.generate_stop_key(hash_key,
stop_sort_key)
if not stop_inclusive:
scan_options.stop_inclusive = stop_inclusive
+
+ # limit key range by prefix filter
+ if scan_options.sortkey_filter_type == filter_type.FT_MATCH_PREFIX and
\
+ len(scan_options.sortkey_filter_pattern) > 0:
+ prefix_start = self.generate_key(hash_key,
scan_options.sortkey_filter_pattern)
+ if bytes_cmp(prefix_start.data, start_key.data) > 0:
+ start_key = prefix_start
+ scan_options.start_inclusive = True
+
+ prefix_stop = self.generate_next_key(hash_key,
scan_options.sortkey_filter_pattern)
Review Comment:
The logic for comparing prefix boundaries could benefit from comments
explaining why these specific comparisons (> 0 and <= 0) are used, as the
boundary logic is not immediately obvious.
```suggestion
prefix_start = self.generate_key(hash_key,
scan_options.sortkey_filter_pattern)
# If the prefix start is after the current start_key, move the
scan start to the prefix.
# This ensures the scan only includes keys with the desired
prefix.
if bytes_cmp(prefix_start.data, start_key.data) > 0:
start_key = prefix_start
scan_options.start_inclusive = True
prefix_stop = self.generate_next_key(hash_key,
scan_options.sortkey_filter_pattern)
# If the prefix stop is before or equal to the current stop_key,
move the scan stop to the prefix stop.
# This ensures the scan does not go beyond the keys with the
desired prefix.
```
##########
python-client/pypegasus/pgclient.py:
##########
@@ -1004,6 +1032,20 @@ def get_scanner(self, hash_key,
stop_key, stop_inclusive = self.generate_stop_key(hash_key,
stop_sort_key)
if not stop_inclusive:
scan_options.stop_inclusive = stop_inclusive
+
+ # limit key range by prefix filter
+ if scan_options.sortkey_filter_type == filter_type.FT_MATCH_PREFIX and
\
+ len(scan_options.sortkey_filter_pattern) > 0:
+ prefix_start = self.generate_key(hash_key,
scan_options.sortkey_filter_pattern)
+ if bytes_cmp(prefix_start.data, start_key.data) > 0:
+ start_key = prefix_start
+ scan_options.start_inclusive = True
+
+ prefix_stop = self.generate_next_key(hash_key,
scan_options.sortkey_filter_pattern)
Review Comment:
The logic for comparing prefix boundaries could benefit from comments
explaining why these specific comparisons (> 0 and <= 0) are used, as the
boundary logic is not immediately obvious.
```suggestion
prefix_start = self.generate_key(hash_key,
scan_options.sortkey_filter_pattern)
# If the prefix start is after the current start_key, move the
scan start to the prefix.
# This ensures the scan only includes keys with the desired
prefix.
if bytes_cmp(prefix_start.data, start_key.data) > 0:
start_key = prefix_start
scan_options.start_inclusive = True
prefix_stop = self.generate_next_key(hash_key,
scan_options.sortkey_filter_pattern)
# If the prefix stop is before or equal to the current stop_key,
move the scan stop to the prefix stop.
# This ensures the scan does not go beyond the keys matching the
prefix.
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]