This is an automated email from the ASF dual-hosted git repository. pingtimeout pushed a commit to branch benchmarks-ppc in repository https://gitbox.apache.org/repos/asf/polaris-tools.git
commit 6313c7f08c66eecbbc0caf0c810983008e7ba970 Author: Pierre Laporte <pie...@pingtimeout.fr> AuthorDate: Thu May 22 17:37:52 2025 +0200 Add Search (v1/v2) operations against Dremio in the mixed workload --- benchmarks/README.md | 13 +++- .../src/gatling/resources/benchmark-defaults.conf | 15 +++- .../benchmarks/actions/ppc/SearchActions.scala | 79 ++++++++++++++++++++++ .../benchmarks/parameters/BenchmarkConfig.scala | 6 +- .../ReadUpdateTreeDatasetParameters.scala | 13 ++-- ...set.scala => ReadUpdateSearchTreeDataset.scala} | 48 +++++++++---- .../simulations/ReadUpdateTreeDataset.scala | 5 +- 7 files changed, 154 insertions(+), 25 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index e681939..ea6723a 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -25,7 +25,7 @@ Benchmarks for the Polaris service using Gatling. - `org.apache.polaris.benchmarks.simulations.CreateTreeDataset`: Creates a test dataset with a specific structure. It is a write-only workload designed to populate the system for subsequent benchmarks. - `org.apache.polaris.benchmarks.simulations.ReadTreeDataset`: Performs read-only operations to fetch namespaces, tables, and views. Some attributes of the objects are also fetched. This benchmark is intended to be used against a Polaris instance with a pre-existing tree dataset. It has no side effects on the dataset and can be executed multiple times without any issues. -- `org.apache.polaris.benchmarks.simulations.ReadUpdateTreeDataset`: Performs read and update operations against a Polaris instance populated with a test dataset. It is a read/write workload that can be used to test the system's ability to handle concurrent read and update operations. It is not destructive and does not prevent subsequent executions of `ReadTreeDataset` or `ReadUpdateTreeDataset`. 
+- `org.apache.polaris.benchmarks.simulations.ReadUpdateSearchTreeDataset`: Performs read and write operations against a Dremio PPC instance populated with a test dataset, as well as search operations against a Dremio instance populated with Wiki content. The ratio of reads to writes and the search throughput are configurable. It is not destructive and does not prevent subsequent executions of `ReadTreeDataset` or `ReadUpdateTreeDataset`. - `org.apache.polaris.benchmarks.simulations.CreateCommits`: Creates table and view commits at configurable rates. This benchmark is useful for testing the system's ability to handle table and view commits and can be used to generate a history of thousands of commits for both tables and views. - `org.apache.polaris.benchmarks.simulations.CreateTreeWiki`: Creates wiki content and tags for entities previously created by `CreateTreeDataset`. This simulation is specific to Dremio PPC and enables the creation of more elaborate data products. The wiki content and tags are generated deterministically, ensuring consistent documentation across multiple runs with the same configuration. 
@@ -76,7 +76,11 @@ Workload settings are configured under `workload`: ```hocon workload { read-update-tree-dataset { - read-write-ratio = 0.8 # Ratio of reads (0.0-1.0) + read-write-ratio = 0.8 # Ratio of reads to writes (0.0-1.0) + read-write-throughput = 100 # Number of read/write operations per second + search-throughput = 20 # Number of search operations per second + duration-in-minutes = 5 # Duration of the simulation in minutes + search-version = 2 # Version of Dremio Search to use (1 or 2) } create-tree-wiki { @@ -106,7 +110,10 @@ http { workload { read-update-tree-dataset { - read-write-ratio = 0.8 # Ratio of reads (0.0-1.0) + read-write-ratio = 0.8 # 80% reads, 20% writes + read-write-throughput = 100 # 100 read/write operations per second + search-throughput = 20 # 20 search operations per second + duration-in-minutes = 5 # Run for 5 minutes } } ``` diff --git a/benchmarks/src/gatling/resources/benchmark-defaults.conf b/benchmarks/src/gatling/resources/benchmark-defaults.conf index c8712cc..715c19e 100644 --- a/benchmarks/src/gatling/resources/benchmark-defaults.conf +++ b/benchmarks/src/gatling/resources/benchmark-defaults.conf @@ -173,13 +173,24 @@ workload { # Default: 0.5 read-write-ratio = 0.5 - # Number of operations to perform per second + # Number of read/write operations to perform per second # Default: 100 - throughput = 100 + read-write-throughput = 100 + + # Number of search operations to perform per second + # Default: 1 + search-throughput = 1 # Duration of the simulation in minutes # Default: 5 duration-in-minutes = 5 + + # Version of the Dremio Search component to use + # Allowed values: 1 or 2 + # - 1: Uses the legacy Search + # - 2: Uses Search v2 + # Default: 2 + search-version = 2 } # Configuration for the CreateTreeWiki simulation diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/actions/ppc/SearchActions.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/actions/ppc/SearchActions.scala new file 
mode 100644 index 0000000..bb0a2a1 --- /dev/null +++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/actions/ppc/SearchActions.scala @@ -0,0 +1,79 @@ +package org.apache.polaris.benchmarks.actions.ppc + +import com.github.javafaker.Faker +import io.gatling.core.Predef._ +import io.gatling.core.feeder.Feeder +import io.gatling.core.structure.ChainBuilder +import io.gatling.http.Predef._ +import org.apache.polaris.benchmarks.parameters.{DatasetParameters, WorkloadParameters} +import play.api.libs.json.Json + +import java.util.concurrent.atomic.AtomicReference + +/** + * Actions for performance testing search operations. This class provides methods to search for + * entities in the catalog using the Search API. + */ +case class SearchActions( + faker: Faker, + wp: WorkloadParameters, + accessToken: AtomicReference[String] +) { + private val logger = org.slf4j.LoggerFactory.getLogger(getClass) + + def folderSearchFeeder(): Feeder[Any] = Iterator.continually { + Map( + "query" -> faker.dog().name(), + "category" -> "FOLDER" + ) + } + + def tableSearchFeeder(): Feeder[Any] = Iterator.continually { + Map( + "query" -> faker.dog().name(), + "category" -> "TABLE" + ) + } + + def viewSearchFeeder(): Feeder[Any] = Iterator.continually { + Map( + "query" -> faker.dog().name(), + "category" -> "VIEW" + ) + } + + /** + * Executes a search query using the v1 API (GET request, uses legacy Search) + */ + val executeSearchV1: ChainBuilder = exec( + http("Execute Search Query V1") + .get("/apiv2/datasets/search/?filter=#{query}") + .header("Authorization", "Bearer #{accessToken}") + .check(status.is(200)) + ) + + /** + * Executes a search query using the v2 API (POST request with filters, uses OpenSearch) + */ + val executeSearchV2: ChainBuilder = exec( + http("Execute Search Query V2") + .post("/api/v3/search") + .header("Authorization", "Bearer #{accessToken}") + .header("Content-Type", "application/json") + .body( + StringBody( + """{ + | "query": "#{query}", + | 
"filter": "category in [\"#{category}\"]" + |}""".stripMargin + ) + ) + .check(status.is(200)) + ) + + val executeSearch: ChainBuilder = if (wp.readUpdateTreeDataset.searchVersion == 1) { + executeSearchV1 + } else { + executeSearchV2 + } +} diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/BenchmarkConfig.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/BenchmarkConfig.scala index 228383e..304ec2d 100644 --- a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/BenchmarkConfig.scala +++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/BenchmarkConfig.scala @@ -62,8 +62,10 @@ object BenchmarkConfig { ), ReadUpdateTreeDatasetParameters( rutdConfig.getDouble("read-write-ratio"), - rutdConfig.getInt("throughput"), - rutdConfig.getInt("duration-in-minutes") + rutdConfig.getInt("read-write-throughput"), + rutdConfig.getInt("search-throughput"), + rutdConfig.getInt("duration-in-minutes"), + rutdConfig.getInt("search-version") ), CreateTreeWikiParameters( ctwConfig.getInt("namespace-concurrency"), diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/ReadUpdateTreeDatasetParameters.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/ReadUpdateTreeDatasetParameters.scala index 40d4c7d..b93632d 100644 --- a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/ReadUpdateTreeDatasetParameters.scala +++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/ReadUpdateTreeDatasetParameters.scala @@ -23,20 +23,25 @@ package org.apache.polaris.benchmarks.parameters * Case class to hold the parameters for the ReadUpdateTreeDataset simulation. * * @param readWriteRatio The ratio of read operations to write operations (0.0-1.0). - * @param throughput The number of operations to perform per second. + * @param readWriteThroughput The number of read and write operations to perform per second. 
+ * @param searchThroughput The number of search operations to perform per second. * @param durationInMinutes The duration of the simulation in minutes. */ case class ReadUpdateTreeDatasetParameters( readWriteRatio: Double, - throughput: Int, - durationInMinutes: Int + readWriteThroughput: Int, + searchThroughput: Int, + durationInMinutes: Int, + searchVersion: Int ) { require( readWriteRatio >= 0.0 && readWriteRatio <= 1.0, "Read/write ratio must be between 0.0 and 1.0 inclusive" ) - require(throughput >= 0, "Throughput cannot be negative") + require(readWriteThroughput >= 0, "Read/write throughput cannot be negative") + require(searchThroughput >= 0, "Search throughput cannot be negative") require(durationInMinutes > 0, "Duration in minutes must be positive") + require(searchVersion == 1 || searchVersion == 2, "Search version must be either 1 or 2") val gatlingReadRatio: Double = readWriteRatio * 100 val gatlingWriteRatio: Double = (1 - readWriteRatio) * 100 diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/simulations/ReadUpdateTreeDataset.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/simulations/ReadUpdateSearchTreeDataset.scala similarity index 79% copy from benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/simulations/ReadUpdateTreeDataset.scala copy to benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/simulations/ReadUpdateSearchTreeDataset.scala index 13b373e..9f3d6e6 100644 --- a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/simulations/ReadUpdateTreeDataset.scala +++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/simulations/ReadUpdateSearchTreeDataset.scala @@ -19,10 +19,12 @@ package org.apache.polaris.benchmarks.simulations +import com.github.javafaker.Faker import io.gatling.core.Predef._ import io.gatling.core.structure.ScenarioBuilder import io.gatling.http.Predef._ import org.apache.polaris.benchmarks.actions._ +import 
org.apache.polaris.benchmarks.actions.ppc.SearchActions import org.apache.polaris.benchmarks.parameters.BenchmarkConfig.config import org.apache.polaris.benchmarks.parameters.{ ConnectionParameters, @@ -33,15 +35,15 @@ import org.apache.polaris.benchmarks.util.CircularIterator import org.slf4j.LoggerFactory import java.util.concurrent.atomic.{AtomicBoolean, AtomicReference} +import java.util.{Locale, Random} import scala.concurrent.duration._ /** - * This simulation tests read and update operations on an existing dataset. - * - * The ratio of read operations to write operations is controlled by the readWriteRatio parameter in - * the ReadUpdateTreeDatasetParameters. + * This simulation tests read and update operations on an existing dataset, by sending queries to + * Dremio PPC directly. It also tests search operations by sending queries to Dremio Search using + * terms generated from the wiki content. */ -class ReadUpdateTreeDataset extends Simulation { +class ReadUpdateSearchTreeDataset extends Simulation { private val logger = LoggerFactory.getLogger(getClass) // -------------------------------------------------------------------------------- @@ -57,10 +59,12 @@ class ReadUpdateTreeDataset extends Simulation { private val accessToken: AtomicReference[String] = new AtomicReference() private val shouldRefreshToken: AtomicBoolean = new AtomicBoolean(true) + private val faker = new Faker(Locale.ENGLISH, new Random(0)) private val authActions = AuthenticationActions(cp, accessToken) private val nsActions = NamespaceActions(dp, wp, accessToken) private val tblActions = TableActions(dp, wp, accessToken) private val viewActions = ViewActions(dp, wp, accessToken) + private val searchActions = SearchActions(faker, wp, accessToken) private val nsListFeeder = new CircularIterator(nsActions.namespaceIdentityFeeder) private val nsExistsFeeder = new CircularIterator(nsActions.namespaceIdentityFeeder) @@ -77,6 +81,11 @@ class ReadUpdateTreeDataset extends Simulation { private 
val viewFetchFeeder = new CircularIterator(viewActions.viewFetchFeeder) private val viewUpdateFeeder = viewActions.propertyUpdateFeeder() + // Search feeders + private val folderSearchFeeder = searchActions.folderSearchFeeder() + private val tableSearchFeeder = searchActions.tableSearchFeeder() + private val viewSearchFeeder = searchActions.viewSearchFeeder() + // -------------------------------------------------------------------------------- // Authentication related workloads: // * Authenticate and store the access token for later use every minute @@ -105,7 +114,9 @@ class ReadUpdateTreeDataset extends Simulation { } // -------------------------------------------------------------------------------- - // Workload: Randomly read and write entities + // Workloads: + // * Randomly read and write entities using the configured ratio + // * Search entities using a 33%/33%/34% ratio between namespaces, tables, and views // -------------------------------------------------------------------------------- val readWriteScenario: ScenarioBuilder = scenario("Read and write entities using the Iceberg REST API") @@ -133,6 +144,17 @@ class ReadUpdateTreeDataset extends Simulation { ) ) + val searchScenario: ScenarioBuilder = + scenario("Search entities") + .group("Search")( + exec(authActions.restoreAccessTokenInSession) + .randomSwitch( + 33.0 -> exec(feed(folderSearchFeeder).exec(searchActions.executeSearch)), + 33.0 -> exec(feed(tableSearchFeeder).exec(searchActions.executeSearch)), + 34.0 -> exec(feed(viewSearchFeeder).exec(searchActions.executeSearch)) + ) + ) + // -------------------------------------------------------------------------------- // Build up the HTTP protocol configuration and set up the simulation // -------------------------------------------------------------------------------- @@ -147,8 +169,9 @@ class ReadUpdateTreeDataset extends Simulation { .disableCaching // Get the configured throughput and duration - private val throughput = 
wp.readUpdateTreeDataset.throughput - private val durationInMinutes = wp.readUpdateTreeDataset.durationInMinutes + private val rwThroughput = wp.readUpdateTreeDataset.readWriteThroughput + private val searchThroughput = wp.readUpdateTreeDataset.searchThroughput + private val duration = wp.readUpdateTreeDataset.durationInMinutes setUp( continuouslyRefreshOauthToken.inject(atOnceUsers(1)).protocols(dremioHttpProtocol), @@ -156,10 +179,11 @@ class ReadUpdateTreeDataset extends Simulation { .inject(atOnceUsers(1)) .andThen( readWriteScenario - .inject( - constantUsersPerSec(throughput).during(durationInMinutes.minutes).randomized - ) - .protocols(polarisHttpProtocol) + .inject(constantUsersPerSec(rwThroughput).during(duration.minutes)) + .protocols(polarisHttpProtocol), + searchScenario + .inject(constantUsersPerSec(searchThroughput).during(duration.minutes)) + .protocols(dremioHttpProtocol) ) .andThen(stopRefreshingToken.inject(atOnceUsers(1)).protocols(dremioHttpProtocol)) ) diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/simulations/ReadUpdateTreeDataset.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/simulations/ReadUpdateTreeDataset.scala index 13b373e..d1963e3 100644 --- a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/simulations/ReadUpdateTreeDataset.scala +++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/simulations/ReadUpdateTreeDataset.scala @@ -147,7 +147,8 @@ class ReadUpdateTreeDataset extends Simulation { .disableCaching // Get the configured throughput and duration - private val throughput = wp.readUpdateTreeDataset.throughput + private val readWriteThroughput = wp.readUpdateTreeDataset.readWriteThroughput + private val searchThroughput = wp.readUpdateTreeDataset.searchThroughput private val durationInMinutes = wp.readUpdateTreeDataset.durationInMinutes setUp( @@ -157,7 +158,7 @@ class ReadUpdateTreeDataset extends Simulation { .andThen( readWriteScenario .inject( - 
constantUsersPerSec(throughput).during(durationInMinutes.minutes).randomized + constantUsersPerSec(readWriteThroughput).during(durationInMinutes.minutes).randomized ) .protocols(polarisHttpProtocol) )