[ https://issues.apache.org/jira/browse/HADOOP-19543?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17945322#comment-17945322 ]
ASF GitHub Bot commented on HADOOP-19543: ----------------------------------------- bhattmanish98 commented on code in PR #7614: URL: https://github.com/apache/hadoop/pull/7614#discussion_r2048648600 ########## hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/utils/TestListUtils.java: ########## @@ -0,0 +1,94 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.azurebfs.utils; + +import java.util.ArrayList; +import java.util.List; + +import org.assertj.core.api.Assertions; +import org.junit.Test; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; + +public class TestListUtils { + + @Test + public void testRemoveDuplicates() { + List<FileStatus> originalList = new ArrayList<>(); + validateList(originalList, 0); + + originalList = new ArrayList<>(); + originalList.add(getFileStatusObject(new Path("/A"))); + validateList(originalList, 1); + + originalList = new ArrayList<>(); + originalList.add(getFileStatusObject(new Path("/A"))); + originalList.add(getFileStatusObject(new Path("/A"))); + validateList(originalList, 1); + + originalList = new ArrayList<>(); + originalList.add(getFileStatusObject(new Path("/a"))); + originalList.add(getFileStatusObject(new Path("/a.bak1"))); + originalList.add(getFileStatusObject(new Path("/a.bak1.bak2"))); + originalList.add(getFileStatusObject(new Path("/a.bak1.bak2"))); + originalList.add(getFileStatusObject(new Path("/a.bak1"))); + originalList.add(getFileStatusObject(new Path("/a"))); + originalList.add(getFileStatusObject(new Path("/abc"))); + originalList.add(getFileStatusObject(new Path("/abc.bak1"))); + originalList.add(getFileStatusObject(new Path("/abc"))); + validateList(originalList, 5); + + originalList = new ArrayList<>(); + originalList.add(getFileStatusObject(new Path("/a"))); + originalList.add(getFileStatusObject(new Path("/a"))); + originalList.add(getFileStatusObject(new Path("/a_bak1"))); + originalList.add(getFileStatusObject(new Path("/a_bak1"))); + originalList.add(getFileStatusObject(new Path("/a_bak1_bak2"))); + originalList.add(getFileStatusObject(new Path("/a_bak1_bak2"))); + originalList.add(getFileStatusObject(new Path("/abc"))); + originalList.add(getFileStatusObject(new Path("/abc"))); + originalList.add(getFileStatusObject(new Path("/abc_bak1"))); + validateList(originalList, 5); + + originalList 
= new ArrayList<>(); + originalList.add(getFileStatusObject(new Path("/a"))); + originalList.add(getFileStatusObject(new Path("/b"))); + validateList(originalList, 2); + + originalList = new ArrayList<>(); + originalList.add(getFileStatusObject(new Path("/a"))); + originalList.add(getFileStatusObject(new Path("/b"))); + originalList.add(getFileStatusObject(new Path("/b"))); + validateList(originalList, 2); + } + + private void validateList(List<FileStatus> originalList, int expectedSize) { Review Comment: Java doc missing for this and below method as well. > ABFS: [FnsOverBlob] Remove Duplicates from Blob Endpoint Listing Across > Iterations > ---------------------------------------------------------------------------------- > > Key: HADOOP-19543 > URL: https://issues.apache.org/jira/browse/HADOOP-19543 > Project: Hadoop Common > Issue Type: Sub-task > Components: fs/azure > Affects Versions: 3.5.0, 3.4.1 > Reporter: Anuj Modi > Assignee: Anuj Modi > Priority: Blocker > Labels: pull-request-available > > On FNS-Blob, the List Blobs API is known to return duplicate entries for the > non-empty explicit directories. One entry corresponds to the directory itself > and another entry corresponds to the marker blob that the driver internally > creates and maintains to mark that path as a directory. We already know about > this behaviour, and it was handled to remove such duplicate entries from the > set of entries that were returned as part of the current list iteration. > Due to a possible partition split, if such duplicate entries happen to be > returned in separate iterations, there is no handling for this and the caller might > get back a result with duplicate entries, as happened in this case. The > logic to remove duplicates was designed before the possibility of a partition > split was realized. 
> This PR fixes this bug -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org For additional commands, e-mail: common-issues-h...@hadoop.apache.org