[
https://issues.apache.org/jira/browse/PARQUET-2006?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17517088#comment-17517088
]
ASF GitHub Bot commented on PARQUET-2006:
-----------------------------------------
rdblue commented on code in PR #950:
URL: https://github.com/apache/parquet-mr/pull/950#discussion_r842099171
##########
parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java:
##########
@@ -878,11 +880,97 @@ public String getFile() {
return blocks;
}
- public void setRequestedSchema(MessageType projection) {
+ private boolean uniqueId(GroupType schema, HashSet<Type.ID> ids) {
+ boolean unique = true;
+ List<Type> fields = schema.getFields();
+ for (Type field : fields) {
+ if (field instanceof PrimitiveType) {
+ Type.ID id = field.getId();
+ if (id != null) {
+ if (ids.contains(id)) {
+ return false;
+ }
+ ids.add(id);
+ }
+ }
+
+ if (field instanceof GroupType) {
+ Type.ID id = field.getId();
+ if (id != null) {
+ if (ids.contains(id)) {
+ return false;
+ }
+ ids.add(id);
+ }
+ if (unique) unique = uniqueId(field.asGroupType(), ids);
+ }
+ }
+ return unique;
+ }
+
+ public MessageType setRequestedSchema(MessageType projection, boolean
useColumnId) {
paths.clear();
- for (ColumnDescriptor col : projection.getColumns()) {
+ MessageType schema = null;
+ if (useColumnId) {
+ HashSet<Type.ID> ids = new HashSet<>();
+ boolean fileSchemaIdUnique = uniqueId(fileMetaData.getSchema(), ids);
+ if (!fileSchemaIdUnique) {
+ throw new RuntimeException("can't use column id resolution because
there are duplicate column ids.");
+ }
+ ids = new HashSet<>();
+ boolean projectionSchemaIdUnique = uniqueId(projection, ids);
+ if (!projectionSchemaIdUnique) {
+ throw new RuntimeException("can't use column id resolution because
there are duplicate column ids.");
+ }
+ schema = resetColumnNameBasedOnId(projection);
+ } else {
+ schema = projection;
+ }
+ for (ColumnDescriptor col : schema.getColumns()) {
paths.put(ColumnPath.get(col.getPath()), col);
}
+ return schema;
+ }
+
+ private MessageType resetColumnNameBasedOnId(MessageType schema) {
+ List<Type> fields = schema.getFields();
+ List<Type> resetFields = resetColumnNameInFields(fields);
+ return new MessageType(schema.getName(), resetFields);
+ }
+
+ private List<Type> resetColumnNameInFields(List<Type> fields) {
+ List<Type> resetFields = new ArrayList<>();
+ for (Type childField : fields) {
+ Type resetChildField = resetColumnNameInField(childField);
+ if (resetChildField != null) {
+ resetFields.add(resetChildField);
+ }
+ }
+ return resetFields;
+ }
+
+ private Type resetColumnNameInField(Type field) {
+ String fieldName = field.getName();
+ Type resetField = null;
+ if (field.isPrimitive()) {
+ Type.ID id = field.getId();
+ List<ColumnDescriptor> descriptors =
fileMetaData.getSchema().getColumns();
+ for (ColumnDescriptor c : descriptors) {
+ Type.ID idInFileMetaData = c.getPrimitiveType().getId();
+ if (idInFileMetaData != null && id != null &&
idInFileMetaData.intValue() == id.intValue()) {
+ fieldName = c.getPrimitiveType().getName();
+ }
+ }
+ resetField = new PrimitiveType(field.getRepetition(),
field.asPrimitiveType().getPrimitiveTypeName(), fieldName);
+ } else {
+ List<Type> childFields = ((GroupType) field).getFields();
+ List<Type> resetFields = resetColumnNameInFields(childFields);
+ if (resetFields.size() > 0) {
+ resetField = ((GroupType) field).withNewFields(resetFields);
Review Comment:
It doesn't look like this renames the group field itself, so if you have a
nested field with a top-level name change, this wouldn't work.
> Column resolution by ID
> -----------------------
>
> Key: PARQUET-2006
> URL: https://issues.apache.org/jira/browse/PARQUET-2006
> Project: Parquet
> Issue Type: New Feature
> Components: parquet-mr
> Reporter: Xinli Shang
> Assignee: Xinli Shang
> Priority: Major
>
> Parquet relies on the column name. In many use cases, e.g. schema resolution,
> this is a problem. Iceberg uses IDs and stores ID/name mappings.
> This Jira is to add support for column resolution by ID.
--
This message was sent by Atlassian Jira
(v8.20.1#820001)