geserdugarov commented on code in PR #12120:
URL: https://github.com/apache/hudi/pull/12120#discussion_r1810148852
##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/keygen/KeyGenUtils.java:
##########
@@ -133,20 +134,87 @@ public static String[] extractRecordKeys(String
recordKey) {
}
public static String[] extractRecordKeysByFields(String recordKey,
List<String> fields) {
- String[] fieldKV = recordKey.split(DEFAULT_RECORD_KEY_PARTS_SEPARATOR);
- return Arrays.stream(fieldKV).map(kv ->
kv.split(DEFAULT_COMPOSITE_KEY_FILED_VALUE, 2))
- .filter(kvArray -> kvArray.length == 1 || fields.isEmpty() ||
(fields.contains(kvArray[0])))
- .map(kvArray -> {
- if (kvArray.length == 1) {
- return kvArray[0];
- } else if (kvArray[1].equals(NULL_RECORDKEY_PLACEHOLDER)) {
- return null;
- } else if (kvArray[1].equals(EMPTY_RECORDKEY_PLACEHOLDER)) {
- return "";
+ // if there is no ',' and ':', then it's a key value
+ if (!recordKey.contains(DEFAULT_RECORD_KEY_PARTS_SEPARATOR) ||
!recordKey.contains(DEFAULT_COLUMN_VALUE_SEPARATOR)) {
+ return new String[] {recordKey};
+ }
+ // complex key case
+ // Here we're reducing memory allocation for substrings and use index
positions,
+ // because for bucket index this will be called for each record, which
leads to GC overhead
+ int keyValueSep1;
+ int keyValueSep2;
+ int commaPosition;
+ String currentField;
+ String currentValue;
+ List<String> values = new ArrayList<>();
+ int processed = 0;
+ while (processed < recordKey.length()) {
+ // note that keyValueSeps and commaPosition are absolute
+ keyValueSep1 = recordKey.indexOf(DEFAULT_COLUMN_VALUE_SEPARATOR,
processed);
+ currentField = recordKey.substring(processed, keyValueSep1);
+ keyValueSep2 = recordKey.indexOf(DEFAULT_COLUMN_VALUE_SEPARATOR,
keyValueSep1 + 1);
+ if (fields.isEmpty() || fields.contains(currentField)) {
+ if (keyValueSep2 < 0) {
+ // there is no next key value pair
+ currentValue = recordKey.substring(keyValueSep1 + 1);
+ processed = recordKey.length();
+ } else {
+ // looking for ',' in reverse order to support multiple ',' in key
values by looking for the latest ','
+ commaPosition =
recordKey.lastIndexOf(DEFAULT_RECORD_KEY_PARTS_SEPARATOR, keyValueSep2);
+ // commaPosition could be -1 if didn't find ',', or we could find
',' from previous key-value pair ('col1:val1,...')
+ // also we could have the last value with ':', so need to check if
keyValueSep2 > 0
+ while (commaPosition < keyValueSep1 && keyValueSep2 > 0) {
+ // If we have key value as a timestamp with ':',
Review Comment:
Added support for `:` in complex key values, and described how it works here.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]