yuchen-ecnu commented on code in PR #23820: URL: https://github.com/apache/flink/pull/23820#discussion_r1428757740
########## flink-runtime/src/test/java/org/apache/flink/runtime/util/profiler/ProfilingServiceTest.java: ########## @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.runtime.util.profiler; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.RestOptions; +import org.apache.flink.runtime.rest.messages.ProfilingInfo; +import org.apache.flink.util.StringUtils; + +import org.junit.jupiter.api.*; +import org.junit.jupiter.api.io.TempDir; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import java.util.ArrayDeque; +import java.util.HashSet; +import java.util.Objects; +import java.util.Set; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; + +/** Unit tests for {@link ProfilingService}. 
*/ +public class ProfilingServiceTest { + + private static final Logger LOG = LoggerFactory.getLogger(ProfilingServiceTest.class); + private static final Configuration configs = new Configuration(); + private static final String NO_ACCESS_TO_PERF_EVENTS = "No access to perf events."; + private static final String NO_ALLOC_SYMBOL_FOUND = "No AllocTracer symbols found."; + private static final String resourceID = "TestJobManager"; + private static final long profilingDuration = 3L; + private static final int historySizeLimit = 3; + + private ProfilingService profilingService; + + @BeforeAll + static void beforeAll() { + configs.set(RestOptions.MAX_PROFILING_HISTORY_SIZE, historySizeLimit); + } + + @BeforeEach + void setUp(@TempDir Path tempDir) { + configs.set(RestOptions.PROFILING_RESULT_DIR, tempDir.toString()); + profilingService = ProfilingService.getInstance(configs); + verifyConfigsWorks(profilingService, tempDir); Review Comment: Because `ProfilingService` is initialized here with the `tempDir` created by JUnit (generated at runtime and not accessible in the `@Test` methods), I verify inside `setUp` that the configured profiling result directory takes effect. ########## flink-runtime/src/test/java/org/apache/flink/runtime/util/profiler/ProfilingServiceTest.java: ########## @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.runtime.util.profiler; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.RestOptions; +import org.apache.flink.runtime.rest.messages.ProfilingInfo; +import org.apache.flink.util.StringUtils; + +import org.junit.jupiter.api.*; +import org.junit.jupiter.api.io.TempDir; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import java.util.ArrayDeque; +import java.util.HashSet; +import java.util.Objects; +import java.util.Set; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; + +/** Unit tests for {@link ProfilingService}. 
*/ +public class ProfilingServiceTest { + + private static final Logger LOG = LoggerFactory.getLogger(ProfilingServiceTest.class); + private static final Configuration configs = new Configuration(); + private static final String NO_ACCESS_TO_PERF_EVENTS = "No access to perf events."; + private static final String NO_ALLOC_SYMBOL_FOUND = "No AllocTracer symbols found."; + private static final String resourceID = "TestJobManager"; + private static final long profilingDuration = 3L; + private static final int historySizeLimit = 3; + + private ProfilingService profilingService; + + @BeforeAll + static void beforeAll() { + configs.set(RestOptions.MAX_PROFILING_HISTORY_SIZE, historySizeLimit); + } + + @BeforeEach + void setUp(@TempDir Path tempDir) { + configs.set(RestOptions.PROFILING_RESULT_DIR, tempDir.toString()); + profilingService = ProfilingService.getInstance(configs); + verifyConfigsWorks(profilingService, tempDir); + } + + @AfterEach + void tearDown() throws IOException { + profilingService.close(); + } + + @Test + public void testSingleInstance() throws IOException { + ProfilingService instance = ProfilingService.getInstance(configs); + Assertions.assertEquals(profilingService, instance); + instance.close(); + } + + @Test + void testFailedRequestUnderProfiling() throws ExecutionException, InterruptedException { + ProfilingInfo profilingInfo = + profilingService + .requestProfiling(resourceID, 10, ProfilingInfo.ProfilingMode.ITIMER) + .get(); + Assertions.assertEquals(ProfilingInfo.ProfilingStatus.RUNNING, profilingInfo.getStatus()); + try { + profilingService + .requestProfiling( + resourceID, profilingDuration, ProfilingInfo.ProfilingMode.ITIMER) + .get(); + Assertions.fail("Duplicate profiling request should throw with IllegalStateException."); + } catch (Exception e) { + Assertions.assertTrue(e.getCause() instanceof IllegalStateException); + } + } + + @Test + @Timeout(value = 1, unit = TimeUnit.MINUTES) + public void testAllProfilingMode() throws 
ExecutionException, InterruptedException { + for (ProfilingInfo.ProfilingMode mode : ProfilingInfo.ProfilingMode.values()) { + ProfilingInfo profilingInfo = + profilingService.requestProfiling(resourceID, profilingDuration, mode).get(); + if (isNoPermissionOrAllocateSymbol(profilingInfo)) { + LOG.warn( + "Ignoring failed profiling instance in {} mode, which caused by no permission.", + profilingInfo.getProfilingMode()); + continue; + } + Assertions.assertEquals( + ProfilingInfo.ProfilingStatus.RUNNING, + profilingInfo.getStatus(), + String.format( + "Submitting profiling request should be succeed or no permission, but got errorMsg=%s", + profilingInfo.getMessage())); + waitForProfilingFinished(profilingService); + Assertions.assertEquals( + ProfilingInfo.ProfilingStatus.FINISHED, + profilingInfo.getStatus(), + String.format( + "Profiling request should complete successful, but got errorMsg=%s", + profilingInfo.getMessage())); + } + + verifyRollingDeletion(profilingService); + } + + private void verifyConfigsWorks(ProfilingService profilingService, Path configuredDir) { + Assertions.assertEquals(configuredDir.toString(), profilingService.getProfilingResultDir()); + Assertions.assertEquals(historySizeLimit, profilingService.getHistorySizeLimit()); + } + + private void verifyRollingDeletion(ProfilingService profilingService) { + ArrayDeque<ProfilingInfo> profilingList = + profilingService.getProfilingListForTest(resourceID); + // Profiling History shouldn't exceed history size limit. + Assertions.assertEquals(historySizeLimit, profilingList.size()); Review Comment: It's safe here because there are five profiling modes(only `CPU` and `ALLOCATION` are affected by permission or AllocTracer symbols) in the async-profiler. But it makes sense to replace them with `<=`. Corrected. 
########## flink-dist/src/main/resources/META-INF/NOTICE: ########## @@ -20,6 +20,7 @@ This project bundles the following dependencies under the Apache Software Licens - org.lz4:lz4-java:1.8.0 - org.objenesis:objenesis:2.1 - org.xerial.snappy:snappy-java:1.1.10.4 +- tools.profiler:async-profiler:2.9 Review Comment: Squashed with the first commit. ########## flink-runtime/src/test/java/org/apache/flink/runtime/util/profiler/ProfilingServiceTest.java: ########## @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.runtime.util.profiler; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.RestOptions; +import org.apache.flink.runtime.rest.messages.ProfilingInfo; +import org.apache.flink.util.StringUtils; + +import org.junit.jupiter.api.*; +import org.junit.jupiter.api.io.TempDir; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import java.util.ArrayDeque; +import java.util.HashSet; +import java.util.Objects; +import java.util.Set; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; + +/** Unit tests for {@link ProfilingService}. */ +public class ProfilingServiceTest { + + private static final Logger LOG = LoggerFactory.getLogger(ProfilingServiceTest.class); + private static final Configuration configs = new Configuration(); + private static final String NO_ACCESS_TO_PERF_EVENTS = "No access to perf events."; + private static final String NO_ALLOC_SYMBOL_FOUND = "No AllocTracer symbols found."; + private static final String resourceID = "TestJobManager"; + private static final long profilingDuration = 3L; + private static final int historySizeLimit = 3; + + private ProfilingService profilingService; + + @BeforeAll + static void beforeAll() { + configs.set(RestOptions.MAX_PROFILING_HISTORY_SIZE, historySizeLimit); + } + + @BeforeEach + void setUp(@TempDir Path tempDir) { + configs.set(RestOptions.PROFILING_RESULT_DIR, tempDir.toString()); + profilingService = ProfilingService.getInstance(configs); + verifyConfigsWorks(profilingService, tempDir); + } + + @AfterEach + void tearDown() throws IOException { + profilingService.close(); + } + + @Test + public void testSingleInstance() throws IOException { + ProfilingService instance = ProfilingService.getInstance(configs); + Assertions.assertEquals(profilingService, instance); + instance.close(); + } + + @Test + void 
testFailedRequestUnderProfiling() throws ExecutionException, InterruptedException { + ProfilingInfo profilingInfo = + profilingService + .requestProfiling(resourceID, 10, ProfilingInfo.ProfilingMode.ITIMER) + .get(); + Assertions.assertEquals(ProfilingInfo.ProfilingStatus.RUNNING, profilingInfo.getStatus()); + try { + profilingService + .requestProfiling( + resourceID, profilingDuration, ProfilingInfo.ProfilingMode.ITIMER) + .get(); + Assertions.fail("Duplicate profiling request should throw with IllegalStateException."); + } catch (Exception e) { + Assertions.assertTrue(e.getCause() instanceof IllegalStateException); + } + } + + @Test + @Timeout(value = 1, unit = TimeUnit.MINUTES) + public void testAllProfilingMode() throws ExecutionException, InterruptedException { + for (ProfilingInfo.ProfilingMode mode : ProfilingInfo.ProfilingMode.values()) { + ProfilingInfo profilingInfo = + profilingService.requestProfiling(resourceID, profilingDuration, mode).get(); + if (isNoPermissionOrAllocateSymbol(profilingInfo)) { + LOG.warn( + "Ignoring failed profiling instance in {} mode, which caused by no permission.", + profilingInfo.getProfilingMode()); + continue; + } + Assertions.assertEquals( + ProfilingInfo.ProfilingStatus.RUNNING, + profilingInfo.getStatus(), + String.format( + "Submitting profiling request should be succeed or no permission, but got errorMsg=%s", + profilingInfo.getMessage())); + waitForProfilingFinished(profilingService); + Assertions.assertEquals( + ProfilingInfo.ProfilingStatus.FINISHED, + profilingInfo.getStatus(), + String.format( + "Profiling request should complete successful, but got errorMsg=%s", + profilingInfo.getMessage())); + } + + verifyRollingDeletion(profilingService); Review Comment: Since rolling deletion is a follow-up action after sampling, it only happens when the number of Profiling Results exceeds the parameter limit. 
So I think it's more suitable to check it in the profiling test, which avoids re-sampling several times just to construct the rolling deletion case specifically. WDYT? ########## flink-runtime-web/web-dashboard/src/app/components/humanize-watermark.pipe.ts: ########## @@ -44,8 +44,8 @@ export class HumanizeWatermarkToDatetimePipe implements PipeTransform { constructor(private readonly configService: ConfigService) {} public transform(value: number): number | string { - if (isNaN(value) || value <= this.configService.LONG_MIN_VALUE) { - return '-'; + if (value == null || isNaN(value) || value <= this.configService.LONG_MIN_VALUE) { + return 'N/A'; Review Comment: We'll humanize the timestamps of `ProfilingStartTime` and `ProfilingFinishedTime` on the web. Because `isNaN(null)` is `false`, a null `value` passed the original guard and was rendered as a wrong time. So an explicit null check has been added in this PR to make this PipeTransform more generic. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
