Re: [PR] ci: compare benchmark PRs with main [texera]

via GitHub Fri, 12 Jun 2026 00:28:54 -0700


Yicong-Huang commented on code in PR #5639:
URL: https://github.com/apache/texera/pull/5639#discussion_r3401502269



##########
.github/workflows/benchmarks-pr-comment.yml:
##########
@@ -197,17 +200,369 @@ jobs:
               }
             };
 
+            const readBenchJson = (filePath, suite) => {
+              if (!fs.existsSync(filePath)) return [];
+              if (fs.statSync(filePath).size > MAX_JSON_BYTES) {
+                core.warning(`${filePath} is too large; skipping JSON comparison input.`);
+                return [];
+              }
+              try {
+                const parsed = JSON.parse(fs.readFileSync(filePath, "utf8"));
+                if (!Array.isArray(parsed)) return [];
+                return parsed
+                  .map((bench) => ({
+                    suite,
+                    name: String(bench.name || ""),
+                    unit: String(bench.unit || ""),
+                    value: Number(bench.value),
+                  }))
+                  .filter((bench) => bench.name && Number.isFinite(bench.value));
+              } catch (e) {
+                core.warning(`failed to parse ${filePath}: ${e.message}`);
+                return [];
+              }
+            };
+
+            const parseCsvRows = (text) => {
+              const rows = text
+                .trim()
+                .split(/\r?\n/)
+                .map((line) => line.split(","));
+              if (rows.length < 2) return [];
+              const header = rows[0].map((h) => h.trim());
+              const idx = (col) => header.indexOf(col);
+              return rows.slice(1).map((row) => ({ row, idx }));
+            };
+
+            const prRowsFromCsv = (text) =>
+              parseCsvRows(text)
+                .map(({ row, idx }) => {
+                  const config = `bs=${row[idx("batch_size")]} sw=${row[idx("schema_width")]} sl=${row[idx("string_len")]}`;
+                  const metric = (suite, prefix, unit, value) => ({
+                    suite,
+                    name: `${prefix} / ${config}`,
+                    unit,
+                    value: Number(value),
+                  });
+                  return {
+                    config,
+                    throughput: metric(
+                      "Arrow Flight E2E Throughput",
+                      "throughput",
+                      "tuples/sec",
+                      row[idx("tuples_per_sec")]
+                    ),
+                    mbps: metric("Arrow Flight E2E MB/s", "MB/s", "MB/s", row[idx("mb_per_sec")]),
+                    p50: metric("Arrow Flight E2E Latency", "latency p50", "us", row[idx("lat_p50_us")]),
+                    p95: metric("Arrow Flight E2E Latency", "latency p95", "us", row[idx("lat_p95_us")]),
+                    p99: metric("Arrow Flight E2E Latency", "latency p99", "us", row[idx("lat_p99_us")]),
+                  };
+                })
+                .filter((item) =>
+                  [item.throughput, item.mbps, item.p50, item.p95, item.p99].every((metric) =>
+                    Number.isFinite(metric.value)
+                  )
+                );
+
+            const prRows = csv ? prRowsFromCsv(csv) : [];
+
+            const bytesPerTuple = (benchName) => {
+              const match = benchName.match(/bs=(\d+)\s+sw=(\d+)\s+sl=(\d+)/);
+              if (!match) return null;
+              return Number(match[2]) * Number(match[3]);
+            };
+
+            const derivedMbpsBench = (throughputBench) => {
+              const bytes = bytesPerTuple(throughputBench.name);
+              if (!bytes) return null;
+              return {
+                name: throughputBench.name.replace(/^throughput/, "MB/s"),
+                unit: "MB/s",
+                value: (Number(throughputBench.value) * bytes) / (1024 * 1024),
+              };
+            };
+
+            const loadMainBaseline = async () => {
+              try {
+                const { data } = await github.rest.repos.getContent({
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  path: "dev/bench/data.js",
+                  ref: "gh-pages",
+                });
+                if (Array.isArray(data) || !data.content) {
+                  core.warning("gh-pages/dev/bench/data.js is not a file.");
+                  return null;
+                }
+                const raw = Buffer.from(data.content, data.encoding || "base64").toString("utf8");

Review Comment:
   Addressed in `24f742341`: `loadMainBaseline()` now only treats non-file 
responses as not-a-file, uses inline `content` when available, and falls back 
to `download_url` for raw `data.js` content when GitHub omits inline content 
for large files.



##########
.github/workflows/benchmarks-pr-comment.yml:
##########
@@ -197,17 +200,369 @@ jobs:
               }
             };
 
+            const readBenchJson = (filePath, suite) => {
+              if (!fs.existsSync(filePath)) return [];
+              if (fs.statSync(filePath).size > MAX_JSON_BYTES) {
+                core.warning(`${filePath} is too large; skipping JSON comparison input.`);

Review Comment:
   Addressed in `24f742341`: removed the unused `readBenchJson()` helper and 
`MAX_JSON_BYTES` constant since the comparison is now driven by the CSV 
artifact plus the gh-pages baseline.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] ci: compare benchmark PRs with main [texera]

Reply via email to