This is an automated email from the ASF dual-hosted git repository. pingtimeout pushed a commit to branch benchmarks-ppc in repository https://gitbox.apache.org/repos/asf/polaris-tools.git
commit cdda2fbdc9c304e5d880f3604b04b4510e2156ef Author: Pierre Laporte <pie...@pingtimeout.fr> AuthorDate: Mon May 12 16:35:21 2025 +0200 Add Wiki and Tags to PPC entities using the CreateTreeWiki simulation --- benchmarks/README.md | 121 ++++++++++ benchmarks/build.gradle.kts | 1 + .../src/gatling/resources/benchmark-defaults.conf | 19 ++ .../polaris/benchmarks/NAryTreeBuilder.scala | 3 +- .../benchmarks/actions/NamespaceActions.scala | 5 +- .../polaris/benchmarks/actions/TableActions.scala | 4 +- .../polaris/benchmarks/actions/ViewActions.scala | 3 + .../benchmarks/actions/ppc/WikiActions.scala | 266 +++++++++++++++++++++ .../benchmarks/parameters/BenchmarkConfig.scala | 9 +- ...meters.scala => CreateTreeWikiParameters.scala} | 22 +- .../benchmarks/parameters/DatasetParameters.scala | 3 +- .../benchmarks/parameters/WorkloadParameters.scala | 3 +- .../benchmarks/simulations/CreateTreeWiki.scala | 166 +++++++++++++ .../polaris/benchmarks/wiki/WikiBuilder.scala | 179 ++++++++++++++ .../wiki/content/BusinessContextGenerator.scala | 40 ++++ .../wiki/content/ChangeHistoryGenerator.scala | 76 ++++++ .../benchmarks/wiki/content/ColumnGenerator.scala | 153 ++++++++++++ .../wiki/content/ContactInformationGenerator.scala | 43 ++++ .../wiki/content/ContainedElementsGenerator.scala | 32 +++ .../wiki/content/DataQualityGenerator.scala | 35 +++ .../benchmarks/wiki/content/LineageGenerator.scala | 53 ++++ .../wiki/content/NamespaceGenerator.scala | 27 +++ .../content/SecurityAndPermissionsGenerator.scala | 57 +++++ .../wiki/content/SqlQueriesGenerator.scala | 121 ++++++++++ .../benchmarks/wiki/content/TableGenerator.scala | 57 +++++ .../benchmarks/wiki/content/TagGenerator.scala | 23 ++ .../wiki/content/TechnicalDetailsGenerator.scala | 19 ++ .../benchmarks/wiki/content/ViewGenerator.scala | 56 +++++ .../benchmarks/wiki/entities/BusinessContext.scala | 18 ++ .../benchmarks/wiki/entities/ChangeHistory.scala | 12 + .../polaris/benchmarks/wiki/entities/Column.scala | 16 ++ 
.../wiki/entities/ContactInformation.scala | 12 + .../benchmarks/wiki/entities/DataQuality.scala | 12 + .../entities/EntityType.scala} | 20 +- .../polaris/benchmarks/wiki/entities/Lineage.scala | 12 + .../benchmarks/wiki/entities/Namespace.scala | 28 +++ .../benchmarks/wiki/entities/Searchable.scala | 76 ++++++ .../wiki/entities/SecurityAndPermissions.scala | 14 ++ .../polaris/benchmarks/wiki/entities/Table.scala | 38 +++ .../wiki/entities/TechnicalDetails.scala | 14 ++ .../polaris/benchmarks/wiki/entities/View.scala | 39 +++ 41 files changed, 1885 insertions(+), 22 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 66df1cf..e681939 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -27,6 +27,7 @@ Benchmarks for the Polaris service using Gatling. - `org.apache.polaris.benchmarks.simulations.ReadTreeDataset`: Performs read-only operations to fetch namespaces, tables, and views. Some attributes of the objects are also fetched. This benchmark is intended to be used against a Polaris instance with a pre-existing tree dataset. It has no side effects on the dataset and can be executed multiple times without any issues. - `org.apache.polaris.benchmarks.simulations.ReadUpdateTreeDataset`: Performs read and update operations against a Polaris instance populated with a test dataset. It is a read/write workload that can be used to test the system's ability to handle concurrent read and update operations. It is not destructive and does not prevent subsequent executions of `ReadTreeDataset` or `ReadUpdateTreeDataset`. - `org.apache.polaris.benchmarks.simulations.CreateCommits`: Creates table and view commits at configurable rates. This benchmark is useful for testing the system's ability to handle table and view commits and can be used to generate a history of thousands of commits for both tables and views. 
+- `org.apache.polaris.benchmarks.simulations.CreateTreeWiki`: Creates wiki content and tags for entities previously created by `CreateTreeDataset`. This simulation is specific to Dremio PPC and enables the creation of more elaborate data products. The wiki content and tags are generated deterministically, ensuring consistent documentation across multiple runs with the same configuration. ## Parameters @@ -77,6 +78,12 @@ workload { read-update-tree-dataset { read-write-ratio = 0.8 # Ratio of reads (0.0-1.0) } + + create-tree-wiki { + namespace-concurrency = 50 # Number of namespace operations to perform simultaneously + table-concurrency = 50 # Number of table operations to perform simultaneously + view-concurrency = 50 # Number of view operations to perform simultaneously + } } ``` @@ -122,6 +129,10 @@ Run benchmarks with your configuration: # Commits creation ./gradlew gatlingRun --simulation org.apache.polaris.benchmarks.simulations.CreateCommits \ -Dconfig.file=./application.conf + +# Wiki and tags creation +./gradlew gatlingRun --simulation org.apache.polaris.benchmarks.simulations.CreateTreeWiki \ + -Dconfig.file=./application.conf ``` A message will show the location of the Gatling report: @@ -238,3 +249,113 @@ The diagram below shows sample catalog, namespace and table definition given the - Number of table properties: `59`  + +## Wiki Content and Tags + +The `CreateTreeWiki` simulation enhances the dataset by adding wiki content and tags to entities. The wiki content is generated deterministically using a fixed random seed, ensuring consistent documentation across multiple runs with the same configuration. 
Different entity types receive different sets of wiki attributes: + +- **Namespaces** receive: business context, change history, and contact information +- **Tables and Views** receive: business context, technical details, data quality information, security permissions, lineage tracking, SQL examples, change history, contact information, and tags + +### Example Wiki Content + +Below is a sample wiki content that might be generated for a table: + +``` +# Wiki for the table: T_0 + +## Entity Overview +- **Entity Name:** T_0 +- **Entity Type:** TABLE +- **Description:** This table contains Document data for Cabbages and Kings reader statistics, Weihenstephaner Hefeweissbier sales data, walrus population data. + +--- + +## Business Context +- **Owner:** Biodex Data Engineering Team +- **Use Cases:** redefine collaborative schemas. +- **Stakeholders:** + - Magnemite: Responsible for the quality and accuracy of the Streamlined intermediate middleware +- **Data Product Relation:** Part of the Ares data product. This data product provides a holistic view of Management Consulting operations. + +--- + +## Technical Details +- **Source Location:** `/source/gcs-ppc/dataset/NS_0/NS_1/NS_3/NS_7/NS_15/NS_31/NS_63/NS_127/NS_255/NS_511/T_0` +- **Update Frequency:** 2 times a day (ingested at least once at 00:46 AM UTC) +- **Retention Policy:** Data retained for 6 years +- **Schema:** + | Column Name | Data Type | Description | + |----------------------------------------|----------------------------------------|--------------------------------------------------------------------------------| + | column1 | CLOB | Unique identifier for this Payment | + | column2 | NCLOB | Unique identifier for this Shipment | + | column3 | INET | Unique identifier for this Review | + | column4 | CIDR | Unique identifier for this Rating | + | column5 | MACADDR | Unique identifier for this Comment | + +--- + +## Data Quality +- **Known Issues:** + - Mike Czech subset of the data is missing. 
+ - Brice Tagg subset of the data is duplicated. +- **Metrics:** + - Average daily transaction count: 729 + - Monthly data growth: 6.36 GB +- **Validation Rules:** + - column1: must be located in a valid US country + - column2: must be located in a valid APAC country + - column2: must be located in a valid EMEA country + +--- + +## Security & Permissions +- **Access Controls:** + - Read Access: Operations Team. + - Write Access: Data Engineering Team. +- **Data Sensitivity:** Contains no PII, suitable for general access + +--- + +## Lineage +- **Upstream Sources:** + - `NS_7`: Feeds product metadata into Enormous Cotton Bottle + - `NS_3`: Supplies transaction details for Durable Aluminum Bag + - `NS_0`: Source of customer information for Incredible Concrete Shoes + - `NS_255`: Contains reference data used by Enormous Granite Plate + - `NS_1`: Contains reference data used by Mediocre Silk Keyboard + - `NS_63`: Provides raw data for Mediocre Rubber Bag + - `NS_511`: Supplies transaction details for Durable Wooden Watch + - `NS_127`: Source of customer information for Enormous Bronze Pants + - `NS_31`: Source of metrics used in Awesome Leather Plate + - `NS_15`: Provides dimension data for Ergonomic Steel Knife +- **Downstream Consumers:** + +--- + +## Change History +- **Last Updated:** 2025-05-09 +- **Changelog:** + - 2025-05-09: Updated description for column `column37` to enable better reporting. Requested by Sid Down + - 2025-05-01: Optimized indexing for column `column38` to fix a bug in data processing. Requested by Abbie Birthday + - 2025-04-23: Added foreign key reference to column `column39` to support integration with new system. Requested by Sid Down + - 2025-04-15: Changed nullability of column `column40` to improve data consistency. Requested by Russell Sprout + - 2025-04-07: Added column `column41` to improve query performance. Requested by Val Veeta + - 2025-03-30: Modified column `column42` to support new business requirements. 
Requested by Eve Hill + - 2025-03-22: Renamed column `column43` to fix data quality issues. Requested by Eve Ning + +--- + +## Contact Information +- **Primary Contact:** Sally Mander <sally.man...@yahoo.com> +- **Support Channels:** + - Slack: #biodex-data-engineering-team-t_0 +``` + +### Example Tags + +Tags are randomly generated and might look like: +- `Magnificent-Vision-Thermal-the-Fated-Sammy` +- `Agent-Entropy-Projection-Machine-Chico` + +These tags can be used for searching, filtering, and organizing the data catalog. diff --git a/benchmarks/build.gradle.kts b/benchmarks/build.gradle.kts index 3f68169..7850b2e 100644 --- a/benchmarks/build.gradle.kts +++ b/benchmarks/build.gradle.kts @@ -15,6 +15,7 @@ tasks.withType<ScalaCompile> { dependencies { gatling("com.typesafe.play:play-json_2.13:2.9.4") gatling("com.typesafe:config:1.4.3") + gatling("com.github.javafaker:javafaker:1.0.2") } repositories { diff --git a/benchmarks/src/gatling/resources/benchmark-defaults.conf b/benchmarks/src/gatling/resources/benchmark-defaults.conf index e90d3ef..c8712cc 100644 --- a/benchmarks/src/gatling/resources/benchmark-defaults.conf +++ b/benchmarks/src/gatling/resources/benchmark-defaults.conf @@ -109,6 +109,10 @@ dataset.tree { # Number of properties to add to each view # Default: 10 view-properties = 10 + + # PPC source name that is used to store the dataset + # Required: Must be provided in configuration + ppc-source-name = null } # Workload configuration @@ -177,4 +181,19 @@ workload { # Default: 5 duration-in-minutes = 5 } + + # Configuration for the CreateTreeWiki simulation + create-tree-wiki { + # Number of namespace operations to perform simultaneously + # Default: 50 + namespace-concurrency = 50 + + # Number of table operations to perform simultaneously + # Default: 50 + table-concurrency = 50 + + # Number of view operations to perform simultaneously + # Default: 50 + view-concurrency = 50 + } } diff --git 
a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/NAryTreeBuilder.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/NAryTreeBuilder.scala index 5be05bc..582b5de 100644 --- a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/NAryTreeBuilder.scala +++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/NAryTreeBuilder.scala @@ -53,7 +53,7 @@ case class NAryTreeBuilder(nsWidth: Int, nsDepth: Int) { * @param ordinal the ordinal of the parent node * @return a list of ordinals representing the child nodes */ - def childrenOf(ordinal: Int): List[Int] = { + def childrenOf(ordinal: Int): List[Int] = if (depthOf(ordinal) >= nsDepth - 1) { // Node is a leaf, has no children List.empty @@ -62,7 +62,6 @@ case class NAryTreeBuilder(nsWidth: Int, nsDepth: Int) { val firstChild = ordinal * nsWidth + 1 (firstChild until firstChild + nsWidth).toList } - } /** * Calculates the depth of a node in the n-ary tree based on its ordinal. diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/actions/NamespaceActions.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/actions/NamespaceActions.scala index 17b265a..d0aa66f 100644 --- a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/actions/NamespaceActions.scala +++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/actions/NamespaceActions.scala @@ -61,14 +61,13 @@ case class NamespaceActions( */ def namespaceIdentityFeeder(): Feeder[Any] = Iterator .from(0) - .map { tableId => - val namespaceId = tableId + .map { namespaceId => val namespacePath: Seq[String] = dp.nAryTree .pathToRoot(namespaceId) .map(ordinal => s"NS_$ordinal") Map( "catalogName" -> "default", - "namespaceId" -> tableId, + "namespaceId" -> namespaceId, "namespacePath" -> namespacePath, "namespaceJsonPath" -> Json.toJson(namespacePath).toString(), "namespaceMultipartPath" -> namespacePath.mkString("%1F") diff --git 
a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/actions/TableActions.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/actions/TableActions.scala index cccd53f..e447de3 100644 --- a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/actions/TableActions.scala +++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/actions/TableActions.scala @@ -30,7 +30,6 @@ import org.apache.polaris.benchmarks.RetryOnHttpCodes.{ import org.apache.polaris.benchmarks.parameters.{DatasetParameters, WorkloadParameters} import org.slf4j.LoggerFactory import play.api.libs.json.Format.GenericFormat -import play.api.libs.json.OFormat.oFormatFromReadsAndOWrites import play.api.libs.json.{Format, Json} import java.util.concurrent.atomic.AtomicReference @@ -93,6 +92,8 @@ case class TableActions( // See https://github.com/apache/iceberg/issues/10084 val fields: Seq[TableField] = (1 to dp.numColumnsPerTable) .map(id => TableField(id = id, name = s"column$id", `type` = "int", required = true)) + val tableName: String = row("tableName").asInstanceOf[String] + val parentNamespacePath: Seq[String] = row("parentNamespacePath").asInstanceOf[Seq[String]] val properties: Map[String, String] = (0 until dp.numTableProperties) .map(id => s"InitialAttribute_$id" -> s"$id") .toMap @@ -101,6 +102,7 @@ case class TableActions( "schemasIdentifierFieldIds" -> "[1]", "fieldsStr" -> Json.toJson(fields).toString(), "fields" -> fields, + "location" -> s"${dp.defaultBaseLocation}/${parentNamespacePath.mkString("/")}/$tableName", "initialJsonProperties" -> Json.toJson(properties).toString() ) } diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/actions/ViewActions.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/actions/ViewActions.scala index f0938cd..d8bc44c 100644 --- a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/actions/ViewActions.scala +++ 
b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/actions/ViewActions.scala @@ -81,6 +81,8 @@ case class ViewActions( .map { row => val viewId = row("viewId").asInstanceOf[Int] val tableName = s"T_$viewId" + val viewName = row("viewName").asInstanceOf[String] + val parentNamespacePath: Seq[String] = row("parentNamespacePath").asInstanceOf[Seq[String]] val fields: Seq[ViewField] = (1 to dp.numColumnsPerView) .map(id => ViewField(id = id, name = s"column$id", `type` = "int", required = true)) val properties: Map[String, String] = (0 until dp.numViewProperties) @@ -92,6 +94,7 @@ case class ViewActions( "fieldsStr" -> Json.toJson(fields).toString(), "fields" -> fields, "sqlQuery" -> s"SELECT * FROM $tableName", + "location" -> s"${dp.defaultBaseLocation}/${parentNamespacePath.mkString("/")}/$viewName", "initialJsonProperties" -> Json.toJson(properties).toString() ) } diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/actions/ppc/WikiActions.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/actions/ppc/WikiActions.scala new file mode 100644 index 0000000..b19c3a6 --- /dev/null +++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/actions/ppc/WikiActions.scala @@ -0,0 +1,266 @@ +package org.apache.polaris.benchmarks.actions.ppc + +import com.github.javafaker.Faker +import io.gatling.core.Predef._ +import io.gatling.core.feeder.Feeder +import io.gatling.core.structure.ChainBuilder +import io.gatling.http.Predef._ +import org.apache.polaris.benchmarks.actions._ +import org.apache.polaris.benchmarks.parameters.{DatasetParameters, WorkloadParameters} +import org.apache.polaris.benchmarks.wiki.WikiBuilder +import org.apache.polaris.benchmarks.wiki.content.{ + NamespaceGenerator, + TableGenerator, + ViewGenerator +} +import play.api.libs.json.Json + +import java.util.concurrent.atomic.AtomicReference + +/** + * Actions for performance testing search operations. 
This class provides methods to create wiki and + * tags for each entity that will then be indexed by Search. + */ +case class WikiActions( + faker: Faker, + dp: DatasetParameters, + wp: WorkloadParameters, + namespaceActions: NamespaceActions, + tableActions: TableActions, + viewActions: ViewActions, + accessToken: AtomicReference[String] +) { + private val logger = org.slf4j.LoggerFactory.getLogger(getClass) + + private val wikiBuilder = WikiBuilder() + + private val namespaceGenerator = NamespaceGenerator(dp.nAryTree, faker) + private val tableGenerator = TableGenerator(dp.nAryTree, faker) + private val viewGenerator = ViewGenerator(dp.nAryTree, faker) + + def namespaceWikiFeeder(): Feeder[Any] = namespaceActions + .namespaceIdentityFeeder() + .map { row => + val namespacePath = row("namespacePath").asInstanceOf[Seq[String]] + val namespaceName = namespacePath.last + val wikiContent = wikiBuilder.buildWiki(namespaceGenerator.newNamespace(namespaceName)) + row ++ Map( + "namespacePathWithSlashes" -> namespacePath.mkString("/"), + "sourceName" -> dp.ppcSourceName, + "wikiContent" -> Json.toJson(wikiContent) + ) + } + + val fetchNamespaceId: ChainBuilder = exec( + http("Fetch Dremio Namespace ID") + .get("/apiv2/source/#{sourceName}/folder/#{namespacePathWithSlashes}/") + .header("Authorization", "Bearer #{accessToken}") + .check(jsonPath("$.id").saveAs("dremioNamespaceId")) + ) + + val fetchNamespaceWikiVersion: ChainBuilder = exec( + http("Fetch Namespace Wiki in Dremio") + .get("/api/v3/catalog/#{dremioNamespaceId}/collaboration/wiki") + .header("Authorization", "Bearer #{accessToken}") + .header("Content-Type", "application/json") + .check(jsonPath("$.version").withDefault("null").saveAs("wikiVersion")) + ) + + val createNamespaceWiki: ChainBuilder = exec { session => + val wikiVersion = if (session("wikiVersion").as[String] == "null") { + null + } else { + session("wikiVersion").as[String] + } + session.set("wikiVersion", Json.toJson(wikiVersion)) + }.exec( + 
http("Create Namespace Wiki in Dremio") + .post("/api/v3/catalog/#{dremioNamespaceId}/collaboration/wiki") + .header("Authorization", "Bearer #{accessToken}") + .header("Content-Type", "application/json") + .body( + StringBody( + """{ + | "text": #{wikiContent}, + | "version": #{wikiVersion} + |}""".stripMargin + ) + ) + ) + + def tableWikiFeeder(): Feeder[Any] = tableActions + .tableCreationFeeder() + .map { row => + val tableName = row("tableName").asInstanceOf[String] + val location = row("location").asInstanceOf[String] + val fields = row("fields").asInstanceOf[Seq[TableField]] + val table = tableGenerator.newTable(tableName, location, fields) + val wikiContent = wikiBuilder.buildWiki(table) + row ++ Map( + "parentNamespacePathWithSlashes" -> row("parentNamespacePath") + .asInstanceOf[Seq[String]] + .mkString("/"), + "sourceName" -> dp.ppcSourceName, + "wikiContent" -> Json.toJson(wikiContent), + "tags" -> Json.toJson(table.tags.get) + ) + } + + val fetchTableId: ChainBuilder = exec( + http("Fetch Dremio Table ID") + .get("/apiv2/source/#{sourceName}/folder/#{parentNamespacePathWithSlashes}/") + .header("Authorization", "Bearer #{accessToken}") + .check( + jmesPath("contents.physicalDatasets[?datasetName == '#{tableName}'] | [0].datasetConfig.id") + .saveAs("dremioTableId") + ) + ) + + val fetchTableWikiVersion: ChainBuilder = exec( + http("Fetch Table Wiki in Dremio") + .get("/api/v3/catalog/#{dremioTableId}/collaboration/wiki") + .header("Authorization", "Bearer #{accessToken}") + .header("Content-Type", "application/json") + .check(jsonPath("$.version").withDefault("null").saveAs("wikiVersion")) + ) + + val createTableWiki: ChainBuilder = exec { session => + val wikiVersion: String = if (session("wikiVersion").as[String] == "null") { + null + } else { + session("wikiVersion").as[String] + } + session.set("wikiVersion", Json.toJson(wikiVersion)) + }.exec( + http("Create Table Wiki in Dremio") + .post("/api/v3/catalog/#{dremioTableId}/collaboration/wiki") + 
.header("Authorization", "Bearer #{accessToken}") + .header("Content-Type", "application/json") + .body( + StringBody( + """{ + | "text": #{wikiContent}, + | "version": #{wikiVersion} + |}""".stripMargin + ) + ) + ) + + val fetchTableTagsVersion: ChainBuilder = exec( + http("Fetch Table Tags in Dremio") + .get("/api/v3/catalog/#{dremioTableId}/collaboration/tag") + .header("Authorization", "Bearer #{accessToken}") + .header("Content-Type", "application/json") + .check(jsonPath("$.version").withDefault("null").saveAs("tagsVersion")) + ) + + val createTableTags: ChainBuilder = exec { session => + val tagsVersion: String = if (session("tagsVersion").as[String] == "null") { + null + } else { + session("tagsVersion").as[String] + } + session.set("tagsVersion", Json.toJson(tagsVersion)) + }.exec( + http("Create Table Tags in Dremio") + .post("/api/v3/catalog/#{dremioTableId}/collaboration/tag") + .header("Authorization", "Bearer #{accessToken}") + .header("Content-Type", "application/json") + .body( + StringBody( + """{ + | "tags": #{tags}, + | "version": #{tagsVersion} + |}""".stripMargin + ) + ) + ) + + def viewWikiFeeder(): Feeder[Any] = viewActions + .viewCreationFeeder() + .map { row => + val viewName = row("viewName").asInstanceOf[String] + val location = row("location").asInstanceOf[String] + val fields = row("fields").asInstanceOf[Seq[ViewField]] + val view = viewGenerator.newView(viewName, location, fields) + val wikiContent = wikiBuilder.buildWiki(view) + row ++ Map( + "parentNamespacePathWithSlashes" -> row("parentNamespacePath") + .asInstanceOf[Seq[String]] + .mkString("/"), + "sourceName" -> dp.ppcSourceName, + "wikiContent" -> Json.toJson(wikiContent), + "tags" -> Json.toJson(view.tags.get) + ) + } + + val fetchViewId: ChainBuilder = exec( + http("Fetch Dremio View ID") + .get("/apiv2/source/#{sourceName}/folder/#{parentNamespacePathWithSlashes}/") + .header("Authorization", "Bearer #{accessToken}") + .check( + jmesPath("contents.datasets[?datasetName == 
'#{viewName}'] | [0].datasetConfig.id") + .saveAs("dremioViewId") + ) + ) + + val fetchViewWikiVersion: ChainBuilder = exec( + http("Fetch View Wiki in Dremio") + .get("/api/v3/catalog/#{dremioViewId}/collaboration/wiki") + .header("Authorization", "Bearer #{accessToken}") + .header("Content-Type", "application/json") + .check(jsonPath("$.version").withDefault("null").saveAs("wikiVersion")) + ) + + val createViewWiki: ChainBuilder = exec { session => + val wikiVersion: String = if (session("wikiVersion").as[String] == "null") { + null + } else { + session("wikiVersion").as[String] + } + session.set("wikiVersion", Json.toJson(wikiVersion)) + }.exec( + http("Create View Wiki in Dremio") + .post("/api/v3/catalog/#{dremioViewId}/collaboration/wiki") + .header("Authorization", "Bearer #{accessToken}") + .header("Content-Type", "application/json") + .body( + StringBody( + """{ + | "text": #{wikiContent}, + | "version": #{wikiVersion} + |}""".stripMargin + ) + ) + ) + + val fetchViewTagsVersion: ChainBuilder = exec( + http("Fetch View Tags in Dremio") + .get("/api/v3/catalog/#{dremioViewId}/collaboration/tag") + .header("Authorization", "Bearer #{accessToken}") + .header("Content-Type", "application/json") + .check(jsonPath("$.version").withDefault("null").saveAs("tagsVersion")) + ) + + val createViewTags: ChainBuilder = exec { session => + val tagsVersion: String = if (session("tagsVersion").as[String] == "null") { + null + } else { + session("tagsVersion").as[String] + } + session.set("tagsVersion", Json.toJson(tagsVersion)) + }.exec( + http("Create View Tags in Dremio") + .post("/api/v3/catalog/#{dremioViewId}/collaboration/tag") + .header("Authorization", "Bearer #{accessToken}") + .header("Content-Type", "application/json") + .body( + StringBody( + """{ + | "tags": #{tags}, + | "version": #{tagsVersion} + |}""".stripMargin + ) + ) + ) +} diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/BenchmarkConfig.scala 
b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/BenchmarkConfig.scala index e5af748..228383e 100644 --- a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/BenchmarkConfig.scala +++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/BenchmarkConfig.scala @@ -43,6 +43,7 @@ object BenchmarkConfig { val rtdConfig = workload.getConfig("read-tree-dataset") val ctdConfig = workload.getConfig("create-tree-dataset") val rutdConfig = workload.getConfig("read-update-tree-dataset") + val ctwConfig = workload.getConfig("create-tree-wiki") WorkloadParameters( CreateCommitsParameters( @@ -63,6 +64,11 @@ object BenchmarkConfig { rutdConfig.getDouble("read-write-ratio"), rutdConfig.getInt("throughput"), rutdConfig.getInt("duration-in-minutes") + ), + CreateTreeWikiParameters( + ctwConfig.getInt("namespace-concurrency"), + ctwConfig.getInt("table-concurrency"), + ctwConfig.getInt("view-concurrency") ) ) } @@ -79,7 +85,8 @@ object BenchmarkConfig { dataset.getInt("views-per-namespace"), dataset.getInt("max-views"), dataset.getInt("columns-per-view"), - dataset.getInt("view-properties") + dataset.getInt("view-properties"), + dataset.getString("ppc-source-name") ) BenchmarkConfig(connectionParams, workloadParams, datasetParams) diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/WorkloadParameters.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/CreateTreeWikiParameters.scala similarity index 55% copy from benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/WorkloadParameters.scala copy to benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/CreateTreeWikiParameters.scala index b392870..77023e7 100644 --- a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/WorkloadParameters.scala +++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/CreateTreeWikiParameters.scala @@ -19,9 +19,19 @@ 
package org.apache.polaris.benchmarks.parameters -case class WorkloadParameters( - createCommits: CreateCommitsParameters, - readTreeDataset: ReadTreeDatasetParameters, - createTreeDataset: CreateTreeDatasetParameters, - readUpdateTreeDataset: ReadUpdateTreeDatasetParameters -) {} +/** + * Case class to hold the parameters for the CreateTreeWiki simulation. + * + * @param namespaceConcurrency The number of namespace operations to perform simultaneously. + * @param tableConcurrency The number of table operations to perform simultaneously. + * @param viewConcurrency The number of view operations to perform simultaneously. + */ +case class CreateTreeWikiParameters( + namespaceConcurrency: Int, + tableConcurrency: Int, + viewConcurrency: Int +) { + require(namespaceConcurrency >= 0, "Namespace concurrency cannot be negative") + require(tableConcurrency >= 0, "Table concurrency cannot be negative") + require(viewConcurrency >= 0, "View concurrency cannot be negative") +} diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/DatasetParameters.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/DatasetParameters.scala index 4d1e306..b10b0ce 100644 --- a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/DatasetParameters.scala +++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/DatasetParameters.scala @@ -49,7 +49,8 @@ case class DatasetParameters( numViewsPerNs: Int, numViewsMax: Int, numColumnsPerView: Int, - numViewProperties: Int + numViewProperties: Int, + ppcSourceName: String ) { val nAryTree: NAryTreeBuilder = NAryTreeBuilder(nsWidth, nsDepth) private val maxPossibleTables = nAryTree.numberOfLastLevelElements * numTablesPerNs diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/WorkloadParameters.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/WorkloadParameters.scala index b392870..9580de1 100644 --- 
a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/WorkloadParameters.scala +++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/WorkloadParameters.scala @@ -23,5 +23,6 @@ case class WorkloadParameters( createCommits: CreateCommitsParameters, readTreeDataset: ReadTreeDatasetParameters, createTreeDataset: CreateTreeDatasetParameters, - readUpdateTreeDataset: ReadUpdateTreeDatasetParameters + readUpdateTreeDataset: ReadUpdateTreeDatasetParameters, + createTreeWiki: CreateTreeWikiParameters ) {} diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/simulations/CreateTreeWiki.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/simulations/CreateTreeWiki.scala new file mode 100644 index 0000000..58e5d96 --- /dev/null +++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/simulations/CreateTreeWiki.scala @@ -0,0 +1,166 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.polaris.benchmarks.simulations + +import com.github.javafaker.Faker +import io.gatling.core.Predef._ +import io.gatling.core.structure.ScenarioBuilder +import io.gatling.http.Predef._ +import org.apache.polaris.benchmarks.actions._ +import org.apache.polaris.benchmarks.actions.ppc.WikiActions +import org.apache.polaris.benchmarks.parameters.BenchmarkConfig.config +import org.apache.polaris.benchmarks.parameters.WorkloadParameters +import org.slf4j.LoggerFactory + +import java.util.concurrent.atomic.{AtomicBoolean, AtomicInteger, AtomicReference} +import java.util.{Locale, Random} +import scala.concurrent.duration.DurationInt + +class CreateTreeWiki extends Simulation { + private val logger = LoggerFactory.getLogger(getClass) + + // -------------------------------------------------------------------------------- + // Load parameters + // -------------------------------------------------------------------------------- + private val cp = config.connectionParameters + private val dp = config.datasetParameters + val wp: WorkloadParameters = config.workloadParameters + + // -------------------------------------------------------------------------------- + // Helper values + // -------------------------------------------------------------------------------- + private val faker = new Faker(Locale.ENGLISH, new Random(0)) + + private val numNamespaces: Int = dp.nAryTree.numberOfNodes + private val accessToken: AtomicReference[String] = new AtomicReference() + private val shouldRefreshToken: AtomicBoolean = new AtomicBoolean(true) + + private val authenticationActions = AuthenticationActions(cp, accessToken) + private val namespaceActions = NamespaceActions(dp, wp, accessToken) + private val tableActions = TableActions(dp, wp, accessToken) + private val viewActions = ViewActions(dp, wp, accessToken) + private val wikiActions = + WikiActions(faker, dp, wp, namespaceActions, tableActions, viewActions, accessToken) + + private val createdNamespaceWikis 
= new AtomicInteger() + private val createdTableWikis = new AtomicInteger() + private val createdViewWikis = new AtomicInteger() + + // -------------------------------------------------------------------------------- + // Authentication related workloads: + // * Authenticate and store the access token for later use every 5 seconds + // * Wait for an OAuth token to be available + // * Stop the token refresh loop + // -------------------------------------------------------------------------------- + val continuouslyRefreshOauthToken: ScenarioBuilder = + scenario("Authenticate every minute using the Iceberg REST API") + .asLongAs(_ => shouldRefreshToken.get()) { + feed(authenticationActions.feeder()) + .exec(authenticationActions.authenticateAndSaveAccessToken) + .pause(5.second) + } + + val waitForAuthentication: ScenarioBuilder = + scenario("Wait for the authentication token to be available") + .asLongAs(_ => accessToken.get() == null) { + pause(1.second) + } + + val stopRefreshingToken: ScenarioBuilder = + scenario("Stop refreshing the authentication token") + .exec { session => + shouldRefreshToken.set(false) + session + } + + // -------------------------------------------------------------------------------- + // Workload: Create wiki content for namespaces, tables and views + // -------------------------------------------------------------------------------- + private val createNamespaceWiki = scenario("Fetch Dremio Namespace ID") + .exec(authenticationActions.restoreAccessTokenInSession) + .asLongAs(session => + createdNamespaceWikis.getAndIncrement() < numNamespaces && session.contains("accessToken") + ) { + exec(authenticationActions.restoreAccessTokenInSession) + .feed(wikiActions.namespaceWikiFeeder()) + .exec(wikiActions.fetchNamespaceId) + .exec(wikiActions.fetchNamespaceWikiVersion) + .exec(wikiActions.createNamespaceWiki) + } + + private val fetchDremioTableIds = scenario("Fetch Dremio Table ID") + .exec(authenticationActions.restoreAccessTokenInSession) + 
.asLongAs(session => + createdTableWikis.getAndIncrement() < dp.numTables && session.contains("accessToken") + ) { + exec(authenticationActions.restoreAccessTokenInSession) + .feed(wikiActions.tableWikiFeeder()) + .exec(wikiActions.fetchTableId) + .exec(wikiActions.fetchTableWikiVersion) + .exec(wikiActions.createTableWiki) + .exec(wikiActions.fetchTableTagsVersion) + .exec(wikiActions.createTableTags) + } + + private val fetchDremioViewIds = scenario("Fetch Dremio View ID") + .exec(authenticationActions.restoreAccessTokenInSession) + .asLongAs(session => + createdViewWikis.getAndIncrement() < dp.numViews && session.contains("accessToken") + ) { + exec(authenticationActions.restoreAccessTokenInSession) + .feed(wikiActions.viewWikiFeeder()) + .exec(wikiActions.fetchViewId) + .exec(wikiActions.fetchViewWikiVersion) + .exec(wikiActions.createViewWiki) + .exec(wikiActions.fetchViewTagsVersion) + .exec(wikiActions.createViewTags) + } + + // -------------------------------------------------------------------------------- + // Build up the HTTP protocol configuration and set up the simulation + // -------------------------------------------------------------------------------- + private val dremioHttpProtocol = http + .baseUrl(cp.dremioBaseUrl) + .acceptHeader("application/json") + .contentTypeHeader("application/json") + + // Get the configured concurrency for namespaces, tables and views + private val namespaceConcurrency = wp.createTreeWiki.namespaceConcurrency + private val tableConcurrency = wp.createTreeWiki.tableConcurrency + private val viewConcurrency = wp.createTreeWiki.viewConcurrency + + setUp( + continuouslyRefreshOauthToken.inject(atOnceUsers(1)).protocols(dremioHttpProtocol), + waitForAuthentication + .inject(atOnceUsers(1)) + .andThen( + createNamespaceWiki + .inject(atOnceUsers(namespaceConcurrency)) + .protocols(dremioHttpProtocol) + ) + .andThen( + fetchDremioTableIds.inject(atOnceUsers(tableConcurrency)).protocols(dremioHttpProtocol) + ) + .andThen( + 
fetchDremioViewIds.inject(atOnceUsers(viewConcurrency)).protocols(dremioHttpProtocol) + ) + .andThen(stopRefreshingToken.inject(atOnceUsers(1)).protocols(dremioHttpProtocol)) + ) +} diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/WikiBuilder.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/WikiBuilder.scala new file mode 100644 index 0000000..1a06643 --- /dev/null +++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/WikiBuilder.scala @@ -0,0 +1,179 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.polaris.benchmarks.wiki + +import org.apache.polaris.benchmarks.wiki.entities.Searchable + +/** + * A builder class for creating markdown content. + */ +case class WikiBuilder() { + + /** + * Builds a complete wiki for the given entity. 
+ * + * @param entity The entity to build a wiki for + * @return The complete wiki content as a string + */ + def buildWiki(entity: Searchable): String = { + val wiki = new StringBuilder() + + // Add header + wiki.append(s"# Wiki for the ${entity.typeStr()}: ${entity.name}\n\n") + + // Add overview section + wiki.append("## Entity Overview\n") + wiki.append(s"- **Entity Name:** ${entity.name}\n") + wiki.append(s"- **Entity Type:** ${entity.entityType()}\n") + wiki.append( + s"- **Description:** This ${entity.typeStr()} contains ${entity.containedElements.mkString(", ")}.\n" + ) + wiki.append("\n---\n\n") + + // Add business context section + wiki.append("## Business Context\n") + wiki.append(s"- **Owner:** ${entity.businessContext.owner}\n") + wiki.append(s"- **Use Cases:** ${entity.businessContext.useCases.mkString(", ")}.\n") + wiki.append("- **Stakeholders:** \n") + entity.businessContext.stakeholders.foreach { case (team, usage) => + wiki.append(s" - $team: $usage\n") + } + wiki.append( + s"- **Data Product Relation:** Part of the ${entity.businessContext.dataProductName} data product. 
This data product ${entity.businessContext.dataProductDescription}.\n" + ) + wiki.append("\n---\n\n") + + // Add technical details section if available + entity.technicalDetails match { + case Some(details) => + wiki.append("## Technical Details\n") + wiki.append(s"- **Source Location:** `${details.sourceLocation}`\n") + wiki.append("- **Schema:**\n") + wiki.append( + " | Column Name | Data Type | Description |\n" + ) + wiki.append( + " |----------------------------------------|----------------------------------------|--------------------------------------------------------------------------------|\n" + ) + entity.columns.foreach { column => + wiki.append( + s" | ${column.name} | ${column.dataType} | ${column.description} |\n" + ) + } + wiki.append(s"- **Update Frequency:** ${details.updateFrequency}\n") + wiki.append(s"- **Retention Policy:** ${details.retentionPolicy}\n") + wiki.append("\n---\n\n") + case None => + // Do nothing if the entity doesn't have technical details + } + + // Add data quality section if available + entity.dataQuality match { + case Some(quality) => + wiki.append("## Data Quality\n") + + // Collect constraints from all columns + val columnConstraints = entity.columns.flatMap { column => + column.constraints.map(constraint => s"${column.name}: $constraint") + } + + if (columnConstraints.nonEmpty) { + wiki.append("- **Validation Rules:**\n") + columnConstraints.foreach { constraint => + wiki.append(s" - $constraint\n") + } + } + + wiki.append("- **Known Issues:**\n") + quality.knownIssues.foreach { issue => + wiki.append(s" - $issue\n") + } + wiki.append("- **Metrics:**\n") + quality.metrics.foreach { case (metricName, metricValue) => + wiki.append(s" - $metricName: $metricValue\n") + } + wiki.append("\n---\n\n") + case None => + // Do nothing if the entity doesn't have data quality information + } + + // Add security and permissions section if available + entity.securityAndPermissions match { + case Some(security) => + wiki.append("## 
Security & Permissions\n") + wiki.append("- **Access Controls:**\n") + wiki.append(s" - Read Access: ${security.readAccess.mkString(", ")}.\n") + wiki.append(s" - Write Access: ${security.writeAccess.mkString(", ")}.\n") + wiki.append(s"- **Data Sensitivity:** ${security.dataSensitivity}\n\n") + wiki.append("---\n\n") + case None => + // Do nothing if the entity doesn't have security and permissions information + } + + // Add lineage section if available + entity.lineage match { + case Some(lineage) => + wiki.append("## Lineage\n") + wiki.append("- **Upstream Sources:**\n") + lineage.upstreamSources.foreach { case (source, description) => + wiki.append(s" - `$source`: $description\n") + } + wiki.append("- **Downstream Consumers:**\n") + lineage.downstreamConsumers.foreach { case (consumer, description) => + wiki.append(s" - `$consumer`: $description\n") + } + wiki.append("\n---\n\n") + case None => + // Do nothing if the entity doesn't have lineage information + } + + // Add SQL examples section if available + entity.sqlExamples match { + case Some(examples) if examples.nonEmpty => + wiki.append("## Query Examples\n```sql\n") + examples.foreach { example => + wiki.append(s"$example\n\n") + } + wiki.append("```\n\n---\n\n") + case _ => + // Do nothing if the entity doesn't have SQL examples + } + + // Add change history section + wiki.append("## Change History\n") + wiki.append(s"- **Last Updated:** ${entity.changeHistory.lastUpdated}\n") + wiki.append("- **Changelog:**\n") + entity.changeHistory.changelog.foreach { case (date, change) => + wiki.append(s" - $date: $change\n") + } + wiki.append("\n---\n\n") + + // Add contact information section + wiki.append("## Contact Information\n") + wiki.append(s"- **Primary Contact:** ${entity.contactInformation.primaryContact}\n") + wiki.append("- **Support Channels:** \n") + entity.contactInformation.supportChannels.foreach { case (channel, details) => + wiki.append(s" - $channel: $details\n") + } + wiki.append("\n---\n\n") 
+ + wiki.toString() + } +} diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/BusinessContextGenerator.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/BusinessContextGenerator.scala new file mode 100644 index 0000000..9a0a1ac --- /dev/null +++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/BusinessContextGenerator.scala @@ -0,0 +1,40 @@ +package org.apache.polaris.benchmarks.wiki.content + +import com.github.javafaker.Faker +import org.apache.polaris.benchmarks.wiki.entities.BusinessContext + +import java.util.concurrent.atomic.AtomicInteger + +case class BusinessContextGenerator(faker: Faker) extends Function0[BusinessContext] { + private val index = new AtomicInteger(0) + + private val responsibilitiesPrefixes: IndexedSeq[String] = Vector( + "Owns and maintains the ", + "Responsible for the quality and accuracy of the ", + "Ensures the security and compliance of the ", + "Manages the ingestion and processing of data for the ", + "Provides support and troubleshooting for issues related to the ", + "Analyzes customer behaviour for ", + "Validates revenue and reconciliation processes for " + ) + + override def apply(): BusinessContext = { + val i = index.getAndIncrement() + val owner = faker.app().name() + val useCases = (0 until i % 3 + 1).map(_ => faker.company().bs()) + + val stakeholders = (0 until i % 3 + 1).map { j => + val prefixIndex = (i + j) % responsibilitiesPrefixes.size + val prefix = responsibilitiesPrefixes(prefixIndex) + faker.pokemon().name() -> (prefix + faker.company().catchPhrase()) + }.toMap + + BusinessContext( + s"$owner Data Engineering Team", + useCases, + stakeholders, + faker.ancient().god(), + s"provides a holistic view of ${faker.company().industry()} operations" + ) + } +} diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/ChangeHistoryGenerator.scala 
b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/ChangeHistoryGenerator.scala new file mode 100644 index 0000000..5593554 --- /dev/null +++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/ChangeHistoryGenerator.scala @@ -0,0 +1,76 @@ +package org.apache.polaris.benchmarks.wiki.content + +import com.github.javafaker.Faker +import org.apache.polaris.benchmarks.wiki.entities.{ChangeHistory, Column} + +import java.time.LocalDate +import java.time.format.DateTimeFormatter +import java.util.concurrent.atomic.AtomicInteger + +case class ChangeHistoryGenerator(faker: Faker) extends Function1[Seq[Column], ChangeHistory] { + private val index = new AtomicInteger(0) + private val dateFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd") + + private val changeActionPrefixes: IndexedSeq[String] = Vector( + "Added column ", + "Modified column ", + "Renamed column ", + "Changed data type of column ", + "Added constraint to column ", + "Removed constraint from column ", + "Updated description for column ", + "Optimized indexing for column ", + "Added foreign key reference to column ", + "Changed nullability of column " + ) + + private val changeActionSuffixes: IndexedSeq[String] = Vector( + " to improve query performance", + " to support new business requirements", + " to fix data quality issues", + " to align with data governance standards", + " to support new analytics use case", + " to comply with security requirements", + " to enable better reporting", + " to fix a bug in data processing", + " to support integration with new system", + " to improve data consistency" + ) + + override def apply(columns: Seq[Column]): ChangeHistory = { + val i = index.getAndIncrement() + + // Base date for the most recent change + val baseDate = LocalDate.now().minusDays(i % 30) + + // Determine number of changes (1-10) + val numChanges = 1 + (i % 10) + + // Generate changelog entries + val changelog = (0 until numChanges).map { j => + val 
changeDate = baseDate.minusDays(j * (7 + (i % 5))).format(dateFormatter) + val funnyName = faker.funnyName().name() + val suffixIndex = (i + j) % changeActionSuffixes.size + val changeSuffix = changeActionSuffixes(suffixIndex) + + val changeDescription = if (columns.isEmpty) { + s"$funnyName requested$changeSuffix." + } else { + val column = columns((i + j) % columns.size).name + val prefixIndex = (i + j) % changeActionPrefixes.size + val changePrefix = changeActionPrefixes(prefixIndex) + s"$changePrefix`$column`$changeSuffix. Requested by $funnyName" + } + + (changeDate, changeDescription) + }.toList + + // Most recent date is the first entry in the changelog + val lastUpdated = if (changelog.nonEmpty) changelog.head._1 else baseDate.format(dateFormatter) + + ChangeHistory( + lastUpdated = lastUpdated, + changelog = changelog + ) + } +} diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/ColumnGenerator.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/ColumnGenerator.scala new file mode 100644 index 0000000..0a9a85c --- /dev/null +++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/ColumnGenerator.scala @@ -0,0 +1,153 @@ +package org.apache.polaris.benchmarks.wiki.content + +import org.apache.polaris.benchmarks.wiki.entities.Column + +import java.util.concurrent.atomic.AtomicInteger + +case class ColumnGenerator() extends Function[String, Column] { + private val index = new AtomicInteger(0) + + private val databaseTypes: IndexedSeq[String] = Vector( + "INT", + "INTEGER", + "SMALLINT", + "BIGINT", + "NUMERIC(10,2)", + "DECIMAL(10,2)", + "TINYINT", + "FLOAT", + "REAL", + "DOUBLE PRECISION", + "CHAR(50)", + "VARCHAR(255)", + "TEXT", + "BINARY(100)", + "VARBINARY(255)", + "BLOB", + "DATE", + "TIME", + "TIMESTAMP", + "DATETIME", + "INTERVAL", + "BOOLEAN", + "GEOMETRY", + "GEOGRAPHY", + "JSON", + "JSONB", + "UUID", + "XML", + "ARRAY", + "ENUM", + "CLOB", + "NCLOB", + "INET", + 
"CIDR", + "MACADDR" + ) + + private val objectTypes: IndexedSeq[String] = Vector( + "Entity", + "Record", + "Class", + "Object", + "Group", + "Instance", + "Row", + "Customer", + "Product", + "Transaction", + "Order", + "Invoice", + "Payment", + "Shipment", + "Review", + "Rating", + "Comment", + "Like", + "Dislike", + "Share", + "Follow", + "Subscribe", + "Unsubscribe", + "Unfollow", + "Unlike", + "Unshare" + ) + + private val descriptionPrefixes: IndexedSeq[String] = Vector( + "Unique identifier for this ", + "Name of the ", + "Creation date of the ", + "Last modified timestamp of the ", + "User who created this ", + "Status (active, inactive, pending) of the ", + "Numeric value representing quantity in this ", + "Number of units purchased in this ", + "Date and time of the transation pertaining to this ", + "Total transaction amount in USD for this ", + "Price or monetary value of the ", + "Geographic coordinates of the ", + "JSON metadata for additional properties belonging to this ", + "Binary data for attachments in this ", + "Reference to parent ", + "Configuration settings for this ", + "Textual description or notes about this ", + "Email address for contact about this ", + "URL or web address for this ", + "Boolean flag indicating status of this ", + "Enumerated type with predefined values for this ", + "IP address for network identification of this ", + "Version number or identifier for this " + ) + + private val constraints: IndexedSeq[String] = Vector( + " must not be null", + " must not be empty", + " must be unique", + " must be one of the allowed values", + " must not be null and must be unique", + " must be greater than 0", + " must be greater than or equal to 0", + " must be lower than 0", + " must be lower than or equal to 0", + " must be equal to the square of the hypotenuse", + " must be located in the first quadrant", + " must be located in a valid European country", + " must be located in a valid US country", + " must be located in a valid APAC 
country", + " must be located in a valid EMEA country", + " must be located in a valid US state", + " must be located in a valid city", + " must be located in a valid zip code of Seattle", + " must be located in a valid zip code of San Francisco", + " must be located in a valid neighborhood", + " must be located within a valid polygon", + " must be located within a 25km radius of the center of the city", + " must be located within a 50km radius of the center of the city", + " must be a class C private IP address", + " must be an IPv4 IP address", + " must be an IPv6 IP address" + ) + + override def apply(columnName: String): Column = { + val i = index.getAndIncrement() + + // Generate description by combining a prefix with an object type + val objectTypeIndex = i % objectTypes.size + val prefixIndex = (i / objectTypes.size) % descriptionPrefixes.size + val description = descriptionPrefixes(prefixIndex) + objectTypes(objectTypeIndex) + + // Add 1 or 2 constraints to 3 out of every 10 columns (index-based, not random) + val numConstraints = if (i % 10 < 3) 1 + (i % 2) else 0 + val columnConstraints = (0 until numConstraints).map { j => + constraints((i + j) % constraints.size) + } + + Column( + name = columnName, + dataType = databaseTypes(i % databaseTypes.size), + description = description, + constraints = columnConstraints + ) + } +} diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/ContactInformationGenerator.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/ContactInformationGenerator.scala new file mode 100644 index 0000000..95983a3 --- /dev/null +++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/ContactInformationGenerator.scala @@ -0,0 +1,43 @@ +package org.apache.polaris.benchmarks.wiki.content + +import com.github.javafaker.Faker +import org.apache.polaris.benchmarks.wiki.entities.{BusinessContext, ContactInformation} + +import java.util.concurrent.atomic.AtomicInteger + +case class 
ContactInformationGenerator(faker: Faker) + extends Function2[BusinessContext, String, ContactInformation] { + private val index = new AtomicInteger(0) + + /** + * Generates a ContactInformation entity based on business context and table name. + * + * @param bizCtx The business context containing owner information + * @param tableName The name of the table for which contact information is being generated + * @return A ContactInformation entity with generated contact details + */ + override def apply(bizCtx: BusinessContext, tableName: String): ContactInformation = { + val i = index.getAndIncrement() + + val contactName = faker.funnyName().name() + val contactEmail = faker.internet().emailAddress(contactName.replace(" ", ".")) + val primaryContact = s"$contactName <$contactEmail>" + + val numChannels = 1 + (i % 3) + val possibleChannels = Seq( + "Slack" -> s"#${bizCtx.owner.toLowerCase.replaceAll("\\s+", "-")}-${tableName.toLowerCase}", + "Jira" -> s"${bizCtx.owner} Project Board", + "Teams" -> s"${bizCtx.owner}", + "Email" -> s"${bizCtx.owner.toLowerCase.replaceAll("\\s+", ".")}@support.example.com", + "Wiki" -> s"https://wiki.example.com/data/${bizCtx.owner.replaceAll("\\s+", "")}", + "GitHub" -> s"github.com/example/${bizCtx.owner.toLowerCase.replaceAll("\\s+", "-")}-data", + "On-call" -> s"PagerDuty: ${bizCtx.owner} Data Support" + ) + + ContactInformation( + primaryContact = primaryContact, + supportChannels = possibleChannels.take(numChannels).toMap + ) + } + +} diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/ContainedElementsGenerator.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/ContainedElementsGenerator.scala new file mode 100644 index 0000000..74b3212 --- /dev/null +++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/ContainedElementsGenerator.scala @@ -0,0 +1,32 @@ +package org.apache.polaris.benchmarks.wiki.content + +import com.github.javafaker.Faker + 
+import java.util.concurrent.atomic.AtomicInteger + +case class ContainedElementsGenerator(faker: Faker) extends Function0[Seq[String]] { + private val index = new AtomicInteger(0) + + private val modelTypes: IndexedSeq[String] = Vector( + "Relational", + "Document", + "Graph", + "Key-Value", + "Columnar", + "Time-series", + "Wide-column" + ) + + override def apply(): Seq[String] = { + val i = index.getAndIncrement() + val animal = faker.animal().name() + val beer = faker.beer().name() + val book = faker.book().title() + val model = modelTypes(i % modelTypes.size) + Seq( + s"$model data for $book reader statistics", + s"$beer sales data", + s"$animal population data" + ) + } +} diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/DataQualityGenerator.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/DataQualityGenerator.scala new file mode 100644 index 0000000..0bf050b --- /dev/null +++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/DataQualityGenerator.scala @@ -0,0 +1,35 @@ +package org.apache.polaris.benchmarks.wiki.content + +import com.github.javafaker.Faker +import org.apache.polaris.benchmarks.wiki.entities.DataQuality + +import java.util.concurrent.atomic.AtomicInteger + +case class DataQualityGenerator(faker: Faker) extends Function0[DataQuality] { + private val index = new AtomicInteger(0) + + private val issuesSuffixes: IndexedSeq[String] = Vector( + " part of the data has not been ingested correctly.", + " subset of the data is missing.", + " subset of the data is duplicated.", + " subset of the data is outdated and not maintained anymore.", + " group of the data is not properly aligned.", + " group is corrupted beyond repair.", + " was entered as a joke and somehow made it to production." 
+ ) + + override def apply(): DataQuality = { + val i = index.getAndIncrement() + val numIssues: Int = 3 - math.pow(i % 5, 2).toInt + val issues = (0 until numIssues).map { j => + faker.funnyName().name() + issuesSuffixes((i + j) % issuesSuffixes.size) + } + DataQuality( + knownIssues = issues.toList, + metrics = Map( + "Average daily transaction count" -> math.pow(3, i % 10).toInt.toString, + "Monthly data growth" -> s"${i % 10}.${i % 99} GB" + ) + ) + } +} diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/LineageGenerator.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/LineageGenerator.scala new file mode 100644 index 0000000..9a80d92 --- /dev/null +++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/LineageGenerator.scala @@ -0,0 +1,53 @@ +package org.apache.polaris.benchmarks.wiki.content + +import com.github.javafaker.Faker +import org.apache.polaris.benchmarks.wiki.entities.Lineage + +import java.util.concurrent.atomic.AtomicInteger + +case class LineageGenerator(faker: Faker) extends Function2[Seq[String], Seq[String], Lineage] { + private val index = new AtomicInteger(0) + + private val upstreamDescriptionPrefixes: IndexedSeq[String] = Vector( + "Provides raw data for ", + "Source of customer information for ", + "Contains reference data used by ", + "Supplies transaction details for ", + "Feeds product metadata into ", + "Provides dimension data for ", + "Source of metrics used in " + ) + + private val downstreamDescriptionPrefixes: IndexedSeq[String] = Vector( + "Consumes data for analytics dashboards about ", + "Uses this data for reporting on ", + "Aggregates information for business insights on ", + "Transforms data for downstream ML models about ", + "Feeds into executive dashboards showing ", + "Powers customer-facing reports on ", + "Drives automated decision making for " + ) + + override def apply(upstreamNames: Seq[String], downstreamNames: Seq[String]): 
Lineage = {
+    val i = index.getAndIncrement()
+
+    // Generate descriptions for upstream sources
+    val upstreamSources = upstreamNames.zipWithIndex.map { case (name, j) =>
+      val suffix = faker.commerce().productName()
+      val prefixIndex = (i + j) % upstreamDescriptionPrefixes.size
+      name -> (upstreamDescriptionPrefixes(prefixIndex) + suffix)
+    }.toMap
+
+    // Generate descriptions for downstream consumers
+    val downstreamConsumers = downstreamNames.zipWithIndex.map { case (name, j) =>
+      val suffix = faker.commerce().department()
+      val prefixIndex = (i + j) % downstreamDescriptionPrefixes.size
+      name -> (downstreamDescriptionPrefixes(prefixIndex) + suffix)
+    }.toMap
+
+    Lineage(
+      upstreamSources = upstreamSources,
+      downstreamConsumers = downstreamConsumers
+    )
+  }
+}
diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/NamespaceGenerator.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/NamespaceGenerator.scala
new file mode 100644
index 0000000..877b180
--- /dev/null
+++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/NamespaceGenerator.scala
@@ -0,0 +1,27 @@
+package org.apache.polaris.benchmarks.wiki.content
+
+import com.github.javafaker.Faker
+import org.apache.polaris.benchmarks.NAryTreeBuilder
+import org.apache.polaris.benchmarks.wiki.entities.Namespace
+
+case class NamespaceGenerator(nAryTree: NAryTreeBuilder, faker: Faker) {
+  private val businessContextGenerator = BusinessContextGenerator(faker)
+  private val containedElementsGenerator = ContainedElementsGenerator(faker)
+  private val changeHistoryGenerator = ChangeHistoryGenerator(faker)
+  private val contactInformationGenerator = ContactInformationGenerator(faker)
+
+  def newNamespace(namespaceName: String): Namespace = {
+    val containedElements = containedElementsGenerator.apply()
+    val businessContext = businessContextGenerator.apply()
+    val changeHistory = changeHistoryGenerator.apply(Seq.empty) // No columns for namespace
+    val contactInformation = contactInformationGenerator.apply(businessContext, namespaceName)
+
+    Namespace(
+      name = namespaceName,
+      containedElements = containedElements,
+      businessContext = businessContext,
+      changeHistory = changeHistory,
+      contactInformation = contactInformation
+    )
+  }
+}
diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/SecurityAndPermissionsGenerator.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/SecurityAndPermissionsGenerator.scala
new file mode 100644
index 0000000..9c3d40f
--- /dev/null
+++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/SecurityAndPermissionsGenerator.scala
@@ -0,0 +1,57 @@
+package org.apache.polaris.benchmarks.wiki.content
+
+import org.apache.polaris.benchmarks.wiki.entities.SecurityAndPermissions
+
+import java.util.concurrent.atomic.AtomicInteger
+
+case class SecurityAndPermissionsGenerator() extends Function0[SecurityAndPermissions] {
+  private val index = new AtomicInteger(0)
+
+  private val teamsSeq: IndexedSeq[String] = Vector(
+    "Data Engineering",
+    "Data Science",
+    "Analytics",
+    "Business Intelligence",
+    "Marketing",
+    "Sales",
+    "Finance",
+    "HR",
+    "Product",
+    "Engineering",
+    "Operations",
+    "Security",
+    "Compliance"
+  )
+
+  private val sensitivitySeq: IndexedSeq[String] = Vector(
+    "Contains no PII, suitable for general access",
+    "Contains business-sensitive information, restricted to internal teams only",
+    "Contains customer data with PII, must be handled according to privacy regulations",
+    "Contains financial data, access restricted to Finance and authorized personnel",
+    "Contains proprietary algorithms and models, restricted to Data Science team",
+    "Contains aggregated metrics, suitable for cross-team usage",
+    "Contains operational metrics, suitable for Operations team",
+    "High business impact data, requires approval for access",
+    "Low sensitivity, available for all internal users"
+  )
+
+  override def apply(): SecurityAndPermissions = {
+    val i = index.getAndIncrement()
+
+    // Generate 1-3 teams for read access
+    val readTeams = (0 until 1 + (i % 3)).map { j =>
+      teamsSeq((i + j) % teamsSeq.size) + " Team"
+    }.toList
+
+    // Generate 1-2 teams for write access (typically fewer than read)
+    val writeTeams = (0 until 1 + (i % 2)).map { j =>
+      teamsSeq((i + j + 3) % teamsSeq.size) + " Team"
+    }.toList
+
+    SecurityAndPermissions(
+      readAccess = readTeams,
+      writeAccess = writeTeams,
+      dataSensitivity = sensitivitySeq(i % sensitivitySeq.size)
+    )
+  }
+}
diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/SqlQueriesGenerator.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/SqlQueriesGenerator.scala
new file mode 100644
index 0000000..f063f9c
--- /dev/null
+++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/SqlQueriesGenerator.scala
@@ -0,0 +1,143 @@
+package org.apache.polaris.benchmarks.wiki.content
+
+import com.github.javafaker.Faker
+import org.apache.polaris.benchmarks.wiki.entities.Column
+
+import java.util.concurrent.atomic.AtomicInteger
+
+case class SqlQueriesGenerator(faker: Faker) extends Function2[String, Seq[Column], Seq[String]] {
+  private val index = new AtomicInteger(0)
+
+  /**
+   * Generates example SQL queries for a table based on its columns.
+   *
+   * @param tableName The name of the table
+   * @param columns The columns in the table
+   * @return A sequence of SQL query examples
+   */
+  override def apply(tableName: String, columns: Seq[Column]): Seq[String] = {
+    val i = index.getAndIncrement()
+
+    // Determine how many queries to generate (1-3)
+    val numQueries = 1 + (i % 3)
+
+    // Get a subset of columns to use in queries
+    val availableColumns =
+      if (columns.nonEmpty) columns
+      else
+        Seq(Column("id", "INTEGER", "Primary key"), Column("name", "VARCHAR", "Name field"))
+
+    // Generate the queries
+    (0 until numQueries).map {
+      case 0 => generateSelectQuery(tableName, availableColumns, i)
+      case 1 => generateAggregateQuery(tableName, availableColumns, i)
+      case 2 => generateJoinQuery(tableName, availableColumns, i)
+      case _ => generateSelectQuery(tableName, availableColumns, i) // Fallback
+    }
+  }
+
+  private def generateSelectQuery(tableName: String, columns: Seq[Column], i: Int): String = {
+    val selectedColumns = columns.take(math.min(10, columns.size))
+    val columnNames = selectedColumns.map(_.name).mkString(",\n ")
+    val whereColumn = selectedColumns.map(_.name).head
+    val condition = getConditionForColumn(i)
+
+    s"""-- Example: Basic select with filtering
+SELECT
+ $columnNames
+FROM
+ $tableName
+WHERE
+ $whereColumn $condition
+LIMIT 100;"""
+  }
+
+  /**
+   * Generates a grouped aggregation example for the table.
+   *
+   * `SUM` and `AVG` are applied to the first column whose data type looks numeric; when no
+   * column qualifies the query falls back to `COUNT(*)` only instead of failing. Rows are
+   * grouped by the first column that differs from the aggregated one, or by the first column
+   * when the schema offers no other choice.
+   *
+   * @param tableName The name of the table
+   * @param columns The columns of the table; must be non-empty (guaranteed by `apply`)
+   * @param i The generation index, unused by this query shape
+   * @return The SQL text of the aggregation example
+   */
+  private def generateAggregateQuery(tableName: String, columns: Seq[Column], i: Int): String = {
+    val numericTypes = Seq("INT", "NUMERIC", "DECIMAL", "FLOAT", "DOUBLE")
+    // Locale.ROOT keeps the match stable under locales with special casing rules (e.g. tr)
+    val aggregateColumn = columns.find { c =>
+      val dataType = c.dataType.toUpperCase(java.util.Locale.ROOT)
+      numericTypes.exists(t => dataType.contains(t))
+    }
+    val groupByColumn = columns.find(c => !aggregateColumn.contains(c)).getOrElse(columns.head)
+
+    aggregateColumn match {
+      case Some(numeric) =>
+        s"""-- Example: Aggregation query with grouping
+SELECT
+ ${groupByColumn.name},
+ COUNT(*) AS count,
+ SUM(${numeric.name}) AS total,
+ AVG(${numeric.name}) AS average
+FROM
+ $tableName
+GROUP BY
+ ${groupByColumn.name}
+ORDER BY
+ total DESC
+LIMIT 10;"""
+      case None =>
+        s"""-- Example: Aggregation query with grouping
+SELECT
+ ${groupByColumn.name},
+ COUNT(*) AS count
+FROM
+ $tableName
+GROUP BY
+ ${groupByColumn.name}
+ORDER BY
+ count DESC
+LIMIT 10;"""
+    }
+  }
+
+  private def generateJoinQuery(tableName: String, columns: Seq[Column], i: Int): String = {
+    val joinColumn = columns.map(_.name).head
+    val relatedTable = "other_table"
+
+    val columnNames =
+      columns.take(math.min(10, columns.size)).map(c => s"t.${c.name}").mkString(", ")
+
+    s"""-- Example: Join with related table
+SELECT
+ $columnNames,
+ r.name AS related_name,
+ r.description AS related_description
+FROM
+ $tableName t
+JOIN
+ $relatedTable r ON t.$joinColumn = r.${tableName}_id
+WHERE
+ r.active = true
+ORDER BY
+ t.$joinColumn
+LIMIT 50;"""
+  }
+
+  private def getConditionForColumn(i: Int): String = {
+    val conditions = Seq(
+      s"= '${faker.commerce().productName()}'",
+      s"LIKE '%${faker.lorem().word()}%'",
+      s"> ${i % 100}",
+      s"< ${(i % 100) + 100}",
+      s"BETWEEN ${i % 50} AND ${(i % 50) + 50}",
+      s"IN ('${faker.commerce().material()}', '${faker.commerce().material()}')",
+      "IS NOT NULL"
+    )
+
+    conditions(i % conditions.size)
+  }
+}
diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/TableGenerator.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/TableGenerator.scala
new file mode 100644
index 0000000..59bf086
--- /dev/null
+++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/TableGenerator.scala
@@ -0,0 +1,57 @@
+package org.apache.polaris.benchmarks.wiki.content
+
+import com.github.javafaker.Faker
+import org.apache.polaris.benchmarks.NAryTreeBuilder
+import org.apache.polaris.benchmarks.actions.TableField
+import org.apache.polaris.benchmarks.wiki.entities.Table
+
+case class TableGenerator(nAryTree: NAryTreeBuilder, faker: Faker) {
+  private val columnGenerator = ColumnGenerator()
+  private val businessContextGenerator = BusinessContextGenerator(faker)
+  private val containedElementsGenerator = ContainedElementsGenerator(faker)
+  private val technicalDetailsGenerator = TechnicalDetailsGenerator()
+  private val dataQualityGenerator = DataQualityGenerator(faker)
+  private val securityAndPermissionsGenerator = SecurityAndPermissionsGenerator()
+  private val lineageGenerator = LineageGenerator(faker)
+  private val sqlQueriesGenerator = SqlQueriesGenerator(faker)
+  private val changeHistoryGenerator = ChangeHistoryGenerator(faker)
+  private val contactInformationGenerator = ContactInformationGenerator(faker)
+  private val tagGenerator = TagGenerator(faker)
+
+  def newTable(tableName: String, sourceLocation: String, fields: Seq[TableField]): Table = {
+    val tableOrdinal = tableName.split("_").last.toInt
+    val columns = fields.map(_.name).map(columnGenerator.apply)
+
+    val containedElements = containedElementsGenerator.apply()
+    val businessContext = businessContextGenerator.apply()
+    val technicalDetails = technicalDetailsGenerator.apply(sourceLocation)
+    val dataQuality = dataQualityGenerator.apply()
+    val securityAndPermissions = securityAndPermissionsGenerator.apply()
+
+    val upstreamSources: Seq[String] =
+      nAryTree.pathToRoot(tableOrdinal).map(ordinal => s"NS_$ordinal")
+    val downstreamSources: Seq[String] = nAryTree.siblingsOf(tableOrdinal).map(i => s"T_${i}")
+    val lineage = lineageGenerator.apply(upstreamSources, downstreamSources)
+
+    val sqlQueries = sqlQueriesGenerator.apply(tableName, columns)
+    val changeHistory = changeHistoryGenerator.apply(columns)
+    val contactInformation = contactInformationGenerator.apply(businessContext, tableName)
+
+    val tableTags = tagGenerator.apply()
+
+    Table(
+      name = tableName,
+      columns = columns,
+      containedElements = containedElements,
+      businessContext = businessContext,
+      details = technicalDetails,
+      quality = dataQuality,
+      secAndPerms = securityAndPermissions,
+      lin = lineage,
+      sqlQueries = sqlQueries,
+      changeHistory = changeHistory,
+      contactInformation = contactInformation,
+      tableTags = tableTags
+    )
+  }
+}
diff --git 
a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/TagGenerator.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/TagGenerator.scala
new file mode 100644
index 0000000..138ef76
--- /dev/null
+++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/TagGenerator.scala
@@ -0,0 +1,23 @@
+package org.apache.polaris.benchmarks.wiki.content
+
+import com.github.javafaker.Faker
+
+import java.util.concurrent.atomic.AtomicInteger
+
+case class TagGenerator(faker: Faker) extends Function0[Seq[String]] {
+  private val index = new AtomicInteger(0)
+
+  override def apply(): Seq[String] = {
+    val i = index.getAndIncrement()
+
+    val numTags = 1 + (i % 3)
+    (0 until numTags).map { j =>
+      val sup = faker.superhero()
+      val dogName = faker.dog().name()
+      // Replace spaces by dashes, then in a separate step remove duplicate dashes as some powers contain ` - ` strings
+      s"${sup.prefix()} ${sup.power()} ${sup.suffix()} $dogName"
+        .replaceAll("\\s+", "-")
+        .replaceAll("-+", "-")
+    }
+  }
+}
diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/TechnicalDetailsGenerator.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/TechnicalDetailsGenerator.scala
new file mode 100644
index 0000000..3a22355
--- /dev/null
+++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/TechnicalDetailsGenerator.scala
@@ -0,0 +1,19 @@
+package org.apache.polaris.benchmarks.wiki.content
+
+import org.apache.polaris.benchmarks.wiki.entities.TechnicalDetails
+
+import java.util.concurrent.atomic.AtomicInteger
+
+case class TechnicalDetailsGenerator() extends Function[String, TechnicalDetails] {
+  private val index = new AtomicInteger(0)
+
+  override def apply(sourceLocation: String): TechnicalDetails = {
+    val i = index.getAndIncrement()
+    TechnicalDetails(
+      sourceLocation = sourceLocation,
+      updateFrequency =
+        s"${1 + (i % 5)} times a day (ingested at least once at 0${i % 9}:${10 + (i % 45)} AM UTC)",
+      retentionPolicy = s"Data retained for ${i % 10} years"
+    )
+  }
+}
diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/ViewGenerator.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/ViewGenerator.scala
new file mode 100644
index 0000000..b37d65a
--- /dev/null
+++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/content/ViewGenerator.scala
@@ -0,0 +1,56 @@
+package org.apache.polaris.benchmarks.wiki.content
+
+import com.github.javafaker.Faker
+import org.apache.polaris.benchmarks.NAryTreeBuilder
+import org.apache.polaris.benchmarks.actions.ViewField
+import org.apache.polaris.benchmarks.wiki.entities.View
+
+case class ViewGenerator(nAryTree: NAryTreeBuilder, faker: Faker) {
+  private val columnGenerator = ColumnGenerator()
+  private val businessContextGenerator = BusinessContextGenerator(faker)
+  private val containedElementsGenerator = ContainedElementsGenerator(faker)
+  private val technicalDetailsGenerator = TechnicalDetailsGenerator()
+  private val dataQualityGenerator = DataQualityGenerator(faker)
+  private val securityAndPermissionsGenerator = SecurityAndPermissionsGenerator()
+  private val lineageGenerator = LineageGenerator(faker)
+  private val sqlQueriesGenerator = SqlQueriesGenerator(faker)
+  private val changeHistoryGenerator = ChangeHistoryGenerator(faker)
+  private val contactInformationGenerator = ContactInformationGenerator(faker)
+  private val tagGenerator = TagGenerator(faker)
+
+  def newView(viewName: String, sourceLocation: String, fields: Seq[ViewField]): View = {
+    val viewOrdinal = viewName.split("_").last.toInt
+    val columns = fields.map(_.name).map(columnGenerator.apply)
+
+    val containedElements = containedElementsGenerator.apply()
+    val businessContext = businessContextGenerator.apply()
+    val technicalDetails = technicalDetailsGenerator.apply(sourceLocation)
+    val dataQuality = dataQualityGenerator.apply()
+    val securityAndPermissions = securityAndPermissionsGenerator.apply()
+
+    val upstreamSources: Seq[String] =
+      nAryTree.pathToRoot(viewOrdinal).map(ordinal => s"NS_$ordinal")
+    val downstreamSources: Seq[String] = nAryTree.siblingsOf(viewOrdinal).map(i => s"T_${i}")
+    val lineage = lineageGenerator.apply(upstreamSources, downstreamSources)
+
+    val sqlQueries = sqlQueriesGenerator.apply(viewName, columns)
+    val changeHistory = changeHistoryGenerator.apply(columns)
+    val contactInformation = contactInformationGenerator.apply(businessContext, viewName)
+    val viewTags = tagGenerator.apply()
+
+    View(
+      name = viewName,
+      columns = columns,
+      containedElements = containedElements,
+      businessContext = businessContext,
+      details = technicalDetails,
+      quality = dataQuality,
+      secAndPerms = securityAndPermissions,
+      lin = lineage,
+      sqlQueries = sqlQueries,
+      changeHistory = changeHistory,
+      contactInformation = contactInformation,
+      viewTags = viewTags
+    )
+  }
+}
diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/BusinessContext.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/BusinessContext.scala
new file mode 100644
index 0000000..c9ccd96
--- /dev/null
+++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/BusinessContext.scala
@@ -0,0 +1,18 @@
+package org.apache.polaris.benchmarks.wiki.entities
+
+/**
+ * Represents the business context of a data entity.
+ *
+ * @param owner The team or person who owns this entity
+ * @param useCases List of use cases for this entity
+ * @param stakeholders Map of stakeholder teams to their usage descriptions
+ * @param dataProductName Name of the data product this entity belongs to
+ * @param dataProductDescription Description of the data product
+ */
+case class BusinessContext(
+  owner: String,
+  useCases: Seq[String],
+  stakeholders: Map[String, String],
+  dataProductName: String,
+  dataProductDescription: String
+)
diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/ChangeHistory.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/ChangeHistory.scala
new file mode 100644
index 0000000..5af04c4
--- /dev/null
+++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/ChangeHistory.scala
@@ -0,0 +1,12 @@
+package org.apache.polaris.benchmarks.wiki.entities
+
+/**
+ * Represents the change history of a data entity.
+ *
+ * @param lastUpdated The date when the entity was last updated
+ * @param changelog A list of changes with dates and descriptions
+ */
+case class ChangeHistory(
+  lastUpdated: String,
+  changelog: List[(String, String)]
+)
diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/Column.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/Column.scala
new file mode 100644
index 0000000..d74dbbe
--- /dev/null
+++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/Column.scala
@@ -0,0 +1,16 @@
+package org.apache.polaris.benchmarks.wiki.entities
+
+/**
+ * Represents a column in a table schema.
+ *
+ * @param name The name of the column
+ * @param dataType The data type of the column
+ * @param description A description of the column's purpose
+ * @param constraints Zero or more constraints applied to this column
+ */
+case class Column(
+  name: String,
+  dataType: String,
+  description: String,
+  constraints: Seq[String] = Seq.empty
+)
diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/ContactInformation.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/ContactInformation.scala
new file mode 100644
index 0000000..5ad8cf0
--- /dev/null
+++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/ContactInformation.scala
@@ -0,0 +1,12 @@
+package org.apache.polaris.benchmarks.wiki.entities
+
+/**
+ * Represents contact information for a data entity.
+ *
+ * @param primaryContact The name and email of the primary contact person
+ * @param supportChannels Map of support channel names to their details
+ */
+case class ContactInformation(
+  primaryContact: String,
+  supportChannels: Map[String, String]
+)
diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/DataQuality.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/DataQuality.scala
new file mode 100644
index 0000000..74ef8a7
--- /dev/null
+++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/DataQuality.scala
@@ -0,0 +1,12 @@
+package org.apache.polaris.benchmarks.wiki.entities
+
+/**
+ * Represents the data quality information for a data entity.
+ *
+ * @param knownIssues List of known data quality issues
+ * @param metrics Map of metric names to their values
+ */
+case class DataQuality(
+  knownIssues: List[String],
+  metrics: Map[String, String]
+)
diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/WorkloadParameters.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/EntityType.scala
similarity index 70%
copy from benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/WorkloadParameters.scala
copy to benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/EntityType.scala
index b392870..30939e1 100644
--- a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/WorkloadParameters.scala
+++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/EntityType.scala
@@ -17,11 +17,18 @@
  * under the License.
  */
 
-package org.apache.polaris.benchmarks.parameters
+package org.apache.polaris.benchmarks.wiki.entities
 
-case class WorkloadParameters(
-  createCommits: CreateCommitsParameters,
-  readTreeDataset: ReadTreeDatasetParameters,
-  createTreeDataset: CreateTreeDatasetParameters,
-  readUpdateTreeDataset: ReadUpdateTreeDatasetParameters
-) {}
+/**
+ * Enum representing different types of entities in the data catalog.
+ */
+sealed trait EntityType {
+  /** Lower-case name of the type; Locale.ROOT keeps it stable under any default locale. */
+  def str: String = this.toString.toLowerCase(java.util.Locale.ROOT)
+}
+
+object EntityType {
+  case object TABLE extends EntityType
+  case object VIEW extends EntityType
+  case object NAMESPACE extends EntityType
+}
diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/Lineage.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/Lineage.scala
new file mode 100644
index 0000000..b67f33f
--- /dev/null
+++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/Lineage.scala
@@ -0,0 +1,12 @@
+package org.apache.polaris.benchmarks.wiki.entities
+
+/**
+ * Represents the lineage information for a data entity.
+ *
+ * @param upstreamSources Map of upstream source names to their descriptions
+ * @param downstreamConsumers Map of downstream consumer names to their descriptions
+ */
+case class Lineage(
+  upstreamSources: Map[String, String],
+  downstreamConsumers: Map[String, String]
+)
diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/Namespace.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/Namespace.scala
new file mode 100644
index 0000000..0628107
--- /dev/null
+++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/Namespace.scala
@@ -0,0 +1,28 @@
+package org.apache.polaris.benchmarks.wiki.entities
+
+/**
+ * Represents a namespace with its name and contained elements.
+ *
+ * @param name The name of the namespace
+ * @param containedElements The elements contained within this namespace
+ * @param businessContext Business context information for this namespace
+ * @param changeHistory Change history information for this namespace
+ * @param contactInformation Contact information for this namespace
+ */
+case class Namespace(
+  name: String,
+  containedElements: Seq[String] = Seq.empty,
+  businessContext: BusinessContext,
+  changeHistory: ChangeHistory,
+  contactInformation: ContactInformation
+) extends Searchable {
+
+  override def entityType(): EntityType = EntityType.NAMESPACE
+  override def columns: Seq[Column] = Seq.empty
+  override def technicalDetails: Option[TechnicalDetails] = None
+  override def dataQuality: Option[DataQuality] = None
+  override def securityAndPermissions: Option[SecurityAndPermissions] = None
+  override def lineage: Option[Lineage] = None
+  override def sqlExamples: Option[Seq[String]] = None
+  override def tags: Option[Seq[String]] = None
+}
diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/Searchable.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/Searchable.scala
new file mode 100644
index 0000000..7de2e20
--- /dev/null
+++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/Searchable.scala
@@ -0,0 +1,76 @@
+package org.apache.polaris.benchmarks.wiki.entities
+
+/**
+ * Interface for entities that can be documented in a wiki.
+ */
+trait Searchable {
+
+  /**
+   * Returns the type of this entity.
+   *
+   * @return The entity type
+   */
+  def entityType(): EntityType
+
+  def typeStr(): String = entityType().str
+
+  /**
+   * The name of the entity.
+   */
+  def name: String
+
+  /**
+   * The columns that make up the entity schema, if any.
+   */
+  def columns: Seq[Column]
+
+  /**
+   * The elements contained within this entity.
+   */
+  def containedElements: Seq[String]
+
+  /**
+   * Business context information for this entity.
+   */
+  def businessContext: BusinessContext
+
+  /**
+   * Optional technical details for this entity.
+   */
+  def technicalDetails: Option[TechnicalDetails]
+
+  /**
+   * Optional data quality information for this entity.
+   */
+  def dataQuality: Option[DataQuality]
+
+  /**
+   * Optional security and permissions information for this entity.
+   */
+  def securityAndPermissions: Option[SecurityAndPermissions]
+
+  /**
+   * Optional lineage information for this entity.
+   */
+  def lineage: Option[Lineage]
+
+  /**
+   * Optional SQL query examples for this entity.
+   */
+  def sqlExamples: Option[Seq[String]]
+
+  /**
+   * Change history information for this entity.
+   */
+  def changeHistory: ChangeHistory
+
+  /**
+   * Contact information for this entity.
+   */
+  def contactInformation: ContactInformation
+
+  /**
+   * Optional tags for this entity.
+   */
+  def tags: Option[Seq[String]]
+}
diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/SecurityAndPermissions.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/SecurityAndPermissions.scala
new file mode 100644
index 0000000..07448e3
--- /dev/null
+++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/SecurityAndPermissions.scala
@@ -0,0 +1,14 @@
+package org.apache.polaris.benchmarks.wiki.entities
+
+/**
+ * Represents the security and permissions information for a data entity.
+ *
+ * @param readAccess List of teams or roles with read access
+ * @param writeAccess List of teams or roles with write access
+ * @param dataSensitivity Description of data sensitivity and handling requirements
+ */
+case class SecurityAndPermissions(
+  readAccess: List[String],
+  writeAccess: List[String],
+  dataSensitivity: String
+)
diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/Table.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/Table.scala
new file mode 100644
index 0000000..c7b2e7f
--- /dev/null
+++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/Table.scala
@@ -0,0 +1,40 @@
+package org.apache.polaris.benchmarks.wiki.entities
+
+/**
+ * Represents a table with its name and column definitions.
+ *
+ * @param name The name of the table
+ * @param columns The columns that make up the table schema
+ * @param containedElements The elements contained within this table
+ * @param businessContext Business context information for this table
+ * @param details Technical details for this table
+ * @param quality Data quality information for this table
+ * @param secAndPerms Security and permissions information for this table
+ * @param lin Lineage information for this table
+ * @param sqlQueries SQL query examples for this table
+ * @param changeHistory Change history information for this table
+ * @param contactInformation Contact information for this table
+ * @param tableTags Tags attached to this table
+ */
+case class Table(
+  name: String,
+  columns: Seq[Column],
+  containedElements: Seq[String] = Seq.empty,
+  businessContext: BusinessContext,
+  details: TechnicalDetails,
+  quality: DataQuality,
+  secAndPerms: SecurityAndPermissions,
+  lin: Lineage,
+  sqlQueries: Seq[String],
+  changeHistory: ChangeHistory,
+  contactInformation: ContactInformation,
+  tableTags: Seq[String]
+) extends Searchable {
+  override def entityType(): EntityType = EntityType.TABLE
+  override def technicalDetails: Option[TechnicalDetails] = Some(details)
+  override def dataQuality: Option[DataQuality] = Some(quality)
+  override def securityAndPermissions: Option[SecurityAndPermissions] = Some(secAndPerms)
+  override def lineage: Option[Lineage] = Some(lin)
+  override def sqlExamples: Option[Seq[String]] = Some(sqlQueries)
+  override def tags: Option[Seq[String]] = Some(tableTags)
+}
diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/TechnicalDetails.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/TechnicalDetails.scala
new file mode 100644
index 0000000..669d2f1
--- /dev/null
+++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/TechnicalDetails.scala
@@ -0,0 +1,14 @@
+package org.apache.polaris.benchmarks.wiki.entities
+
+/**
+ * Represents the technical details of a data entity.
+ *
+ * @param sourceLocation The location where the data is stored
+ * @param updateFrequency How often the data is updated
+ * @param retentionPolicy The data retention policy
+ */
+case class TechnicalDetails(
+  sourceLocation: String,
+  updateFrequency: String,
+  retentionPolicy: String
+)
diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/View.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/View.scala
new file mode 100644
index 0000000..506c426
--- /dev/null
+++ b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/wiki/entities/View.scala
@@ -0,0 +1,40 @@
+package org.apache.polaris.benchmarks.wiki.entities
+
+/**
+ * Represents a view with its name and column definitions.
+ *
+ * @param name The name of the view
+ * @param columns The columns that make up the view schema
+ * @param containedElements The elements contained within this view
+ * @param businessContext Business context information for this view
+ * @param details Technical details for this view
+ * @param quality Data quality information for this view
+ * @param secAndPerms Security and permissions information for this view
+ * @param lin Lineage information for this view
+ * @param sqlQueries SQL query examples for this view
+ * @param changeHistory Change history information for this view
+ * @param contactInformation Contact information for this view
+ * @param viewTags Tags attached to this view
+ */
+case class View(
+  name: String,
+  columns: Seq[Column],
+  containedElements: Seq[String] = Seq.empty,
+  businessContext: BusinessContext,
+  details: TechnicalDetails,
+  quality: DataQuality,
+  secAndPerms: SecurityAndPermissions,
+  lin: Lineage,
+  sqlQueries: Seq[String],
+  changeHistory: ChangeHistory,
+  contactInformation: ContactInformation,
+  viewTags: Seq[String]
+) extends Searchable {
+  override def entityType(): EntityType = EntityType.VIEW
+  override def technicalDetails: Option[TechnicalDetails] = Some(details)
+  override def dataQuality: Option[DataQuality] = Some(quality)
+  override def securityAndPermissions: Option[SecurityAndPermissions] = Some(secAndPerms)
+  override def lineage: Option[Lineage] = Some(lin)
+  override def sqlExamples: Option[Seq[String]] = Some(sqlQueries)
+  override def tags: Option[Seq[String]] = Some(viewTags)
+}