kasakrisz commented on code in PR #6413:
URL: https://github.com/apache/hive/pull/6413#discussion_r3442593254
##########
ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java:
##########
@@ -194,6 +201,9 @@ public Table makeCopy() {
newTab.setMetaTable(this.getMetaTable());
newTab.setSnapshotRef(this.getSnapshotRef());
+ if (this.tablePartCols != null) {
+ newTab.tablePartCols = new ArrayList<>(this.tablePartCols);
Review Comment:
Should we make a deep copy? `FieldSchema` is mutable.
##########
ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java:
##########
@@ -682,7 +704,7 @@ public void setSkewedValueLocationMap(List<String> valList,
String dirName) {
public Map<List<String>, String> getSkewedColValueLocationMaps() {
return (tTable.getSd().getSkewedInfo() != null) ?
tTable.getSd().getSkewedInfo()
- .getSkewedColValueLocationMaps() : new HashMap<List<String>, String>();
+ .getSkewedColValueLocationMaps() : new HashMap<>();
Review Comment:
Is this a read-only map? If so, `Collections.emptyMap()` can be used instead
of `new HashMap<>()`.
##########
ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java:
##########
@@ -110,6 +113,10 @@ public class Table implements Serializable {
/**
* These fields are all cached fields. The information comes from tTable.
*/
+ private List<FieldSchema> tablePartCols;
+ private record TableColumn(int index, FieldSchema field) {}
+ private transient Map<String, TableColumn> columnsByName;
+ private transient List<FieldSchema> tableNonPartCols;
Review Comment:
`tableNonPartCols` is transient. Why is `tablePartCols` not?
##########
ql/src/java/org/apache/hadoop/hive/ql/parse/rewrite/MergeRewriter.java:
##########
@@ -238,20 +239,29 @@ public void
appendWhenMatchedUpdateClause(MergeStatement.UpdateClause updateClau
protected void addValues(Table targetTable, String targetAlias,
Map<String, String> newValues,
List<String> values) {
- UnaryOperator<String> formatter = name -> String.format("%s.%s",
targetAlias,
+ UnaryOperator<String> formatter = name -> String.format("%s.%s",
targetAlias,
HiveUtils.unparseIdentifier(name, conf));
-
+ List<String> valuesToBeAdded = new
ArrayList<>(Collections.nCopies(targetTable.getAllCols().size(), null));
for (FieldSchema fieldSchema : targetTable.getCols()) {
- if (newValues.containsKey(fieldSchema.getName())) {
- String rhsExp = newValues.get(fieldSchema.getName());
- values.add(getRhsExpValue(rhsExp,
formatter.apply(fieldSchema.getName())));
- } else {
- values.add(formatter.apply(fieldSchema.getName()));
- }
+ setColumnValue(targetTable, valuesToBeAdded, newValues, formatter,
fieldSchema.getName(), true);
}
-
- targetTable.getPartCols().forEach(fieldSchema -> values.add(
- formatter.apply(fieldSchema.getName())));
+
+ for (FieldSchema partCol : targetTable.getPartCols()) {
+ setColumnValue(targetTable, valuesToBeAdded, newValues, formatter,
partCol.getName(),
+ targetTable.hasNonNativePartitionSupport());
+ }
+ values.addAll(valuesToBeAdded);
+ }
+
+ protected void setColumnValue(Table targetTable, List<String>
valuesToBeAdded,
+ Map<String, String> newValues, UnaryOperator<String> formatter, String
columnName,
+ boolean applyNewValues) {
+ int index = targetTable.getColumnIndexByName(columnName);
+ String formattedColumn = formatter.apply(columnName);
+ String value = applyNewValues && newValues.containsKey(columnName)
+ ? getRhsExpValue(newValues.get(columnName), formattedColumn)
+ : formattedColumn;
+ valuesToBeAdded.set(index, value);
Review Comment:
Why is this refactor necessary? `MergeRewriter` is only used for native ACID
tables, and in that case partition column values cannot be changed. In the case
of non-native txn storage formats like Iceberg, `SplitMergeRewriter` is used.
##########
ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java:
##########
@@ -916,6 +987,9 @@ public void setDbName(String databaseName) {
}
public List<FieldSchema> getPartitionKeys() {
+ if (tTable.getPartitionKeys() == null) {
+ tTable.setPartitionKeys(new ArrayList<>());
Review Comment:
Can `Collections.emptyList()` be used here? Can the new List instance be
immutable?
##########
ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java:
##########
@@ -728,8 +750,50 @@ private boolean isField(String col) {
return false;
}
+ private void ensureColumnsIndexed() {
+ if (columnsByName != null) {
+ return;
+ }
+ Map<String, TableColumn> indexedColumns = new HashMap<>();
+ List<FieldSchema> fsList = new ArrayList<>(getColsInternal(false));
+ if (!hasNonNativePartitionSupport() || isView()) {
+ fsList.addAll(getPartitionKeys());
+ }
+ for (int i = 0; i < fsList.size(); i++) {
+ indexedColumns.put(fsList.get(i).getName().toLowerCase(), new
TableColumn(i, fsList.get(i)));
+ }
+ columnsByName = indexedColumns;
+ }
+
+ public Integer getColumnIndexByName(String colName) {
+ ensureColumnsIndexed();
+ TableColumn column = columnsByName.get(colName.toLowerCase());
+ return column != null ? column.index() : null;
+ }
+
+ public FieldSchema getColumnByName(String colName) {
+ ensureColumnsIndexed();
+ TableColumn column = columnsByName.get(colName.toLowerCase());
+ return column != null ? column.field() : null;
+ }
+
public List<FieldSchema> getCols() {
- return getColsInternal(false);
+ if (tableNonPartCols != null) {
+ return tableNonPartCols;
+ }
+ if (!hasNonNativePartitionSupport()) {
+ tableNonPartCols = getColsInternal(false);
+ } else {
+ List<FieldSchema> nonPartFields = new ArrayList<>();
+ Set<String> partFieldsName =
getPartCols().stream().map(FieldSchema::getName).collect(Collectors.toSet());
+ for (FieldSchema field : getColsInternal(false)) {
+ if (!partFieldsName.contains(field.getName())) {
+ nonPartFields.add(field);
+ }
+ }
+ tableNonPartCols = nonPartFields;
Review Comment:
The new instance of `ArrayList` can be removed by:
```
List<FieldSchema> nonPartFields = getColsInternal(false);
List<FieldSchema> partCols = getPartCols();
Set<String> partColNames = partCols.stream()
.map(FieldSchema::getName)
.collect(Collectors.toSet());
tableNonPartCols = nonPartFields.stream()
.filter(field -> !partColNames.contains(field.getName()))
.collect(Collectors.toList());
```
##########
ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java:
##########
@@ -728,8 +750,50 @@ private boolean isField(String col) {
return false;
}
+ private void ensureColumnsIndexed() {
+ if (columnsByName != null) {
+ return;
+ }
+ Map<String, TableColumn> indexedColumns = new HashMap<>();
Review Comment:
How about `Maps.newHashMapWithExpectedSize(fsList.size())`?
##########
ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java:
##########
@@ -595,26 +605,38 @@ public boolean equals(Object obj) {
&& Objects.equals(snapshotRef, other.snapshotRef);
}
+
+ /**
+ * Returns partition columns for native tables, or from the
+ * storage handler when {@link #hasNonNativePartitionSupport()}.
+ */
public List<FieldSchema> getPartCols() {
- List<FieldSchema> partKeys = tTable.getPartitionKeys();
- if (partKeys == null) {
- partKeys = new ArrayList<>();
- tTable.setPartitionKeys(partKeys);
+ if (tablePartCols != null) {
+ return tablePartCols;
}
- return partKeys;
+ if (hasNonNativePartitionSupport()) {
+ List<FieldSchema> partCols = getStorageHandler().getPartitionKeys(this);
+ for (FieldSchema partCol : partCols) {
+ FieldSchema storageSchemaCol = getColumnByName(partCol.getName());
+ if (storageSchemaCol != null && storageSchemaCol.getComment() != null)
{
+ partCol.setComment(storageSchemaCol.getComment());
+ }
+ }
Review Comment:
IIUC the `FieldSchema`s coming from `IcebergStorageHandler` do not contain
user comments, so we need to add them from HMS.
In cases where the Iceberg table comes from an external catalog outside of
HMS, we don't have these user comments.
Please add a comment to highlight this.
##########
ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java:
##########
@@ -691,7 +713,7 @@ public void setSkewedColValues(List<List<String>>
skewedValues) {
public List<List<String>> getSkewedColValues(){
return (tTable.getSd().getSkewedInfo() != null) ?
tTable.getSd().getSkewedInfo()
- .getSkewedColValues() : new ArrayList<List<String>>();
+ .getSkewedColValues() : new ArrayList<>();
Review Comment:
Is this a read-only list? If so, how about `Collections.emptyList()`?
##########
ql/src/java/org/apache/hadoop/hive/ql/parse/rewrite/SplitUpdateRewriter.java:
##########
@@ -134,4 +116,40 @@ public ParseUtils.ReparseResult rewrite(Context context,
UpdateStatement updateB
return rr;
}
+
+ private void appendUpdateColumn(UpdateStatement updateBlock,
+ MultiInsertSqlGenerator sqlGenerator, List<String> insertValues,
+ Map<Integer, ASTNode> setColExprs, String columnName, int
setColExprIndex, int columnOffset,
+ boolean appendToSelect, boolean prependComma) {
+ String identifier = HiveUtils.unparseIdentifier(columnName, conf);
+
+ // The insert value is placed for every column (data and partition).
+ int index = updateBlock.getTargetTable().getColumnIndexByName(columnName);
+ insertValues.set(index, sqlGenerator.qualify(identifier));
+
+ // Native tables: partition cols are already in the subquery SELECT
(appendAcidSelectColumns).
+ // Non-native tables: appendAcidSelectColumns exposes only delete-prefixed
aliases, so add plain names below (appendToSelect).
+ if (!appendToSelect) {
+ return;
+ }
Review Comment:
AFAIK, we currently don't support updating partition column values, and it is
out of the scope of this patch.
Anyway, I think this is the point where we should create a new subclass of
`SplitUpdateRewriter`, like `NonNativeSplitUpdateRewriter`, because from this
point the rest of this method body is ignored, and this `appendToSelect` depends
on a constant:
https://github.com/apache/hive/blob/1fd716117e214cffa04e48c76fc3cb6b3e3723ac/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java#L381-L384
##########
ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java:
##########
@@ -700,7 +722,7 @@ public void setSkewedColNames(List<String> skewedColNames) {
public List<String> getSkewedColNames() {
return (tTable.getSd().getSkewedInfo() != null) ?
tTable.getSd().getSkewedInfo()
- .getSkewedColNames() : new ArrayList<String>();
+ .getSkewedColNames() : new ArrayList<>();
Review Comment:
Is this a read-only list? If so, how about `Collections.emptyList()`?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]