This is an automated email from the ASF dual-hosted git repository.
klesh pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/devlake.git
The following commit(s) were added to refs/heads/main by this push:
new f4888416e fix(github): create domain accounts for non-committer
authors (#8886) (#8894)
f4888416e is described below
commit f4888416e400c245c55b4d5643529a64fc3d86f7
Author: Juan Antonio Osorio <[email protected]>
AuthorDate: Fri Jun 12 16:21:22 2026 +0300
fix(github): create domain accounts for non-committer authors (#8886)
(#8894)
* fix(github): create domain accounts for non-committer authors (#8886)
ConvertAccounts sourced the domain `accounts` table FROM
_tool_github_accounts,
which is only populated for users we collected full profiles for
(effectively,
committers). Issue and PR authors who never committed were written into
_tool_github_repo_accounts but never converted, so issues.creator_id and
pull_requests.author_id pointed at accounts rows that didn't exist.
Source ConvertAccounts FROM _tool_github_repo_accounts LEFT JOIN
_tool_github_accounts instead, so every user the repo references gets a
domain
account, enriched with profile detail when we have it and login-only
otherwise.
The domain id uses the same generator the issue/PR convertors use, so the
FKs
line up. Also emit a repo_account for a PR's merged_by user so
pull_requests.merged_by_id resolves too.
The query stays MySQL/PostgreSQL-agnostic (COALESCE, no backtick quoting,
parameterized via the dal) and mirrors the join already in
account_org_collector.go.
Adds the non-committer orphan case to the e2e fixture plus a referential-
integrity assertion in TestAccountDataFlow. Verified on both MySQL and
PostgreSQL.
Co-Authored-By: Claude Opus 4.8 (1M context) <[email protected]>
* fix(github): carry _raw_data provenance and guard zero account ids
Review feedback on #8894: the rewritten ConvertAccounts dropped the
_raw_data fields from converted accounts. Select them with COALESCE,
preferring the enriched _tool_github_accounts row and falling back to
the _tool_github_repo_accounts row for non-committers. The e2e fixture
now carries the _raw_data columns like every other tool fixture, and
the regenerated snapshot pins the pre-existing provenance for enriched
accounts plus the issue-extractor provenance for the non-committer
case.
Also guard PR author and merged-by id generation against zero ids: an
unmerged PR or a deleted user otherwise yields the domain id
github:GithubAccount:<conn>:0, the same orphan-FK shape this PR fixes.
issue_convertor already guards its AuthorId the same way.
Verified with the full github e2e suite on both MySQL and PostgreSQL.
Co-Authored-By: Claude Fable 5 <[email protected]>
---------
Co-authored-by: Claude Opus 4.8 (1M context) <[email protected]>
---
backend/plugins/github/e2e/account_test.go | 33 ++++++++++
.../e2e/raw_tables/_tool_github_repo_accounts.csv | 33 +++++-----
.../plugins/github/e2e/snapshot_tables/account.csv | 1 +
backend/plugins/github/tasks/account_convertor.go | 70 +++++++++++++++++-----
backend/plugins/github/tasks/pr_convertor.go | 11 +++-
backend/plugins/github/tasks/pr_extractor.go | 10 ++++
6 files changed, 126 insertions(+), 32 deletions(-)
diff --git a/backend/plugins/github/e2e/account_test.go
b/backend/plugins/github/e2e/account_test.go
index 817ef6fab..7b6466aba 100644
--- a/backend/plugins/github/e2e/account_test.go
+++ b/backend/plugins/github/e2e/account_test.go
@@ -20,11 +20,15 @@ package e2e
import (
"testing"
+ "github.com/apache/incubator-devlake/core/dal"
"github.com/apache/incubator-devlake/core/models/domainlayer/crossdomain"
+ "github.com/apache/incubator-devlake/core/models/domainlayer/didgen"
"github.com/apache/incubator-devlake/helpers/e2ehelper"
"github.com/apache/incubator-devlake/plugins/github/impl"
"github.com/apache/incubator-devlake/plugins/github/models"
"github.com/apache/incubator-devlake/plugins/github/tasks"
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
)
func TestAccountDataFlow(t *testing.T) {
@@ -108,4 +112,33 @@ func TestAccountDataFlow(t *testing.T) {
"_raw_data_remark",
},
)
+
+ // Referential-integrity invariant (#8886): every account the repo
references in
+ // _tool_github_repo_accounts must have a domain `accounts` row, so
issues.creator_id /
+ // pull_requests.author_id / merged_by_id never point at a missing
account. We generate
+ // the domain id with the SAME generator the issue/PR convertors use,
so this is a
+ // faithful proxy for the FK join the issue reported as broken. It also
fails loudly if a
+ // future change shrinks ConvertAccounts' coverage or diverges the id
generation.
+ accountIdGen := didgen.NewDomainIdGenerator(&models.GithubAccount{})
+ var repoAccounts []models.GithubRepoAccount
+ require.NoError(t, dataflowTester.Dal.All(&repoAccounts,
+ dal.Where("repo_github_id = ? AND connection_id = ? AND
account_id > 0",
+ taskData.Options.GithubId,
taskData.Options.ConnectionId),
+ ))
+ require.NotEmpty(t, repoAccounts, "fixture must reference at least one
account")
+ sawOrphanCase := false
+ for _, ra := range repoAccounts {
+ if ra.Login == "milichev" {
+ sawOrphanCase = true // the non-committer author from
the issue repro
+ }
+ domainId :=
accountIdGen.Generate(taskData.Options.ConnectionId, ra.AccountId)
+ count, err := dataflowTester.Dal.Count(
+ dal.From(&crossdomain.Account{}),
+ dal.Where("id = ?", domainId),
+ )
+ require.NoError(t, err)
+ assert.Equalf(t, int64(1), count,
+ "orphan FK: repo account %q (id=%d) has no domain
accounts row %q", ra.Login, ra.AccountId, domainId)
+ }
+ assert.True(t, sawOrphanCase, "fixture should include the non-committer
orphan case (milichev)")
}
diff --git
a/backend/plugins/github/e2e/raw_tables/_tool_github_repo_accounts.csv
b/backend/plugins/github/e2e/raw_tables/_tool_github_repo_accounts.csv
index 8b8666442..7aa42a4b0 100644
--- a/backend/plugins/github/e2e/raw_tables/_tool_github_repo_accounts.csv
+++ b/backend/plugins/github/e2e/raw_tables/_tool_github_repo_accounts.csv
@@ -1,16 +1,17 @@
-connection_id,account_id,repo_github_id,login
-1,21979,134018330,appleboy
-1,964542,134018330,sarathsp06
-1,1052632,134018330,runner-mei
-1,3794113,134018330,shanhuhai5739
-1,3971390,134018330,ppmoon
-1,7496278,134018330,panjf2000
-1,8518239,134018330,gitter-badger
-1,11763614,2,Moonlight-Zhao
-1,12420699,2,shanghai-Jerry
-1,14950473,2,zqkgo
-1,22429695,2,codecov[bot]
-1,24841832,2,rikewang
-1,31087327,2,chensanle
-1,32893410,2,zhangyuanxue
-1,38849208,2,king526
\ No newline at end of file
+connection_id,account_id,repo_github_id,login,_raw_data_params,_raw_data_table,_raw_data_id,_raw_data_remark
+1,21979,134018330,appleboy,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,8,
+1,964542,134018330,sarathsp06,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,1,
+1,1052632,134018330,runner-mei,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,13,
+1,3794113,134018330,shanhuhai5739,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,2,
+1,3971390,134018330,ppmoon,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,14,
+1,7496278,134018330,panjf2000,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,5,
+1,8518239,134018330,gitter-badger,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,9,
+1,145564,134018330,milichev,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_issues,1,
+1,11763614,2,Moonlight-Zhao,,,0,
+1,12420699,2,shanghai-Jerry,,,0,
+1,14950473,2,zqkgo,,,0,
+1,22429695,2,codecov[bot],,,0,
+1,24841832,2,rikewang,,,0,
+1,31087327,2,chensanle,,,0,
+1,32893410,2,zhangyuanxue,,,0,
+1,38849208,2,king526,,,0,
diff --git a/backend/plugins/github/e2e/snapshot_tables/account.csv
b/backend/plugins/github/e2e/snapshot_tables/account.csv
index 1092e7a52..c3022a4a2 100644
--- a/backend/plugins/github/e2e/snapshot_tables/account.csv
+++ b/backend/plugins/github/e2e/snapshot_tables/account.csv
@@ -1,5 +1,6 @@
id,email,full_name,user_name,avatar_url,organization,_raw_data_params,_raw_data_table,_raw_data_id,_raw_data_remark
github:GithubAccount:1:1052632,runner.mei@,runner,runner-mei,https://avatars.githubusercontent.com/u/1052632?v=4,,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,13,
+github:GithubAccount:1:145564,,,milichev,,,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_issues,1,
github:GithubAccount:1:21979,[email protected],Bo-Yi
Wu,appleboy,https://avatars.githubusercontent.com/u/21979?v=4,"COSCUP,nodejs-tw,moztw,h5bp,CodeIgniter-TW,drone,Getmore,golangtw,laravel-taiwan,go-xorm,gin-gonic,PHPConf-TW,Mediatek-Cloud,SJFinder,go-gitea,laradock,gin-contrib,tagfans,maintainers,go-training,go-ggz,the-benchmarker,golang-queue","{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,8,
github:GithubAccount:1:3794113,[email protected],Derek,shanhuhai5739,https://avatars.githubusercontent.com/u/3794113?v=4,,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,2,
github:GithubAccount:1:3971390,[email protected],ppmoon,ppmoon,https://avatars.githubusercontent.com/u/3971390?v=4,,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,14,
diff --git a/backend/plugins/github/tasks/account_convertor.go
b/backend/plugins/github/tasks/account_convertor.go
index e86780d27..3d4db97d1 100644
--- a/backend/plugins/github/tasks/account_convertor.go
+++ b/backend/plugins/github/tasks/account_convertor.go
@@ -22,6 +22,7 @@ import (
"github.com/apache/incubator-devlake/core/dal"
"github.com/apache/incubator-devlake/core/errors"
+ "github.com/apache/incubator-devlake/core/models/common"
"github.com/apache/incubator-devlake/core/models/domainlayer"
"github.com/apache/incubator-devlake/core/models/domainlayer/crossdomain"
"github.com/apache/incubator-devlake/core/models/domainlayer/didgen"
@@ -30,6 +31,19 @@ import (
"github.com/apache/incubator-devlake/plugins/github/models"
)
+// repoAccountForConvert is the row projected by ConvertAccounts' query: every
+// account referenced by the repo (from _tool_github_repo_accounts), enriched
+// with profile detail from _tool_github_accounts when it was collected. The
+// embedded NoPKModel carries the RawDataOrigin across to the domain row.
+type repoAccountForConvert struct {
+ Id int
+ Login string
+ Name string
+ Email string
+ AvatarUrl string
+ common.NoPKModel
+}
+
func init() {
RegisterSubtaskMeta(&ConvertAccountsMeta)
}
@@ -38,12 +52,12 @@ var ConvertAccountsMeta = plugin.SubTaskMeta{
Name: "Convert Users",
EntryPoint: ConvertAccounts,
EnabledByDefault: true,
- Description: "Convert tool layer table github_accounts into
domain layer table accounts",
+ Description: "Convert every account referenced by the repo (tool
layer repo_accounts, enriched by github_accounts) into domain layer table
accounts",
DomainTypes: []string{plugin.DOMAIN_TYPE_CROSS},
DependencyTables: []string{
- models.GithubAccount{}.TableName(), // cursor
- models.GithubRepoAccount{}.TableName(), // cursor
- models.GithubAccountOrg{}.TableName()}, // account id gen
+ models.GithubRepoAccount{}.TableName(), // cursor (every user
referenced by the repo)
+ models.GithubAccount{}.TableName(), // left-join enrichment
(profile detail, optional)
+ models.GithubAccountOrg{}.TableName()}, // org pluck
ProductTables: []string{crossdomain.Account{}.TableName()},
}
@@ -53,7 +67,7 @@ func ConvertAccounts(taskCtx plugin.SubTaskContext)
errors.Error {
accountIdGen := didgen.NewDomainIdGenerator(&models.GithubAccount{})
- converter, err :=
api.NewStatefulDataConverter(&api.StatefulDataConverterArgs[models.GithubAccount]{
+ converter, err :=
api.NewStatefulDataConverter(&api.StatefulDataConverterArgs[repoAccountForConvert]{
SubtaskCommonArgs: &api.SubtaskCommonArgs{
SubTaskContext: taskCtx,
Table: RAW_ACCOUNT_TABLE,
@@ -62,29 +76,57 @@ func ConvertAccounts(taskCtx plugin.SubTaskContext)
errors.Error {
Name: data.Options.Name,
},
},
+ // Source every account referenced by this repo from
_tool_github_repo_accounts
+ // (which the issue/PR/commit extractors populate for any
author, assignee, or
+ // merged-by user), and LEFT JOIN _tool_github_accounts for
profile detail when it
+ // was collected. This guarantees a domain `accounts` row for
every CreatorId /
+ // AuthorId the other convertors emit, instead of only for
users who committed.
+ // Raw-data provenance follows the same rule as the profile
fields: the enriched
+ // _tool_github_accounts row when we collected one, the
repo_accounts row otherwise.
+ // Note the consequence: fallback-provenance rows carry a
_raw_data_table other than
+ // _raw_github_api_accounts, so the batch-save divider's
full-sync delete-then-reinsert
+ // (keyed on this converter's raw table) never deletes them;
they are reconciled by
+ // upsert only. Scope deletion still covers them via
_raw_data_params.
+ // SQL is kept DB-agnostic (no backtick quoting, COALESCE not
IFNULL) so it runs on
+ // both MySQL and PostgreSQL.
Input: func(stateManager *api.SubtaskStateManager) (dal.Rows,
errors.Error) {
clauses := []dal.Clause{
- dal.Select("_tool_github_accounts.*"),
- dal.From(&models.GithubAccount{}),
+
dal.Select(`_tool_github_repo_accounts.account_id AS id,
+ _tool_github_repo_accounts.login AS
login,
+ COALESCE(ga.name, '') AS name,
+ COALESCE(ga.email, '') AS email,
+ COALESCE(ga.avatar_url, '') AS
avatar_url,
+ COALESCE(ga._raw_data_params,
_tool_github_repo_accounts._raw_data_params) AS _raw_data_params,
+ COALESCE(ga._raw_data_table,
_tool_github_repo_accounts._raw_data_table) AS _raw_data_table,
+ COALESCE(ga._raw_data_id,
_tool_github_repo_accounts._raw_data_id) AS _raw_data_id,
+ COALESCE(ga._raw_data_remark,
_tool_github_repo_accounts._raw_data_remark) AS _raw_data_remark`),
+ dal.From(&models.GithubRepoAccount{}),
+ dal.Join(`left join _tool_github_accounts ga on
(
+ ga.connection_id =
_tool_github_repo_accounts.connection_id
+ AND ga.id =
_tool_github_repo_accounts.account_id
+ )`),
dal.Where(
- "repo_github_id = ? and
_tool_github_accounts.connection_id=?",
+
`_tool_github_repo_accounts.repo_github_id = ?
+ AND
_tool_github_repo_accounts.connection_id = ?
+ AND
_tool_github_repo_accounts.account_id > 0`,
data.Options.GithubId,
data.Options.ConnectionId,
),
- dal.Join(`left join _tool_github_repo_accounts
gra on (
- _tool_github_accounts.connection_id =
gra.connection_id
- AND _tool_github_accounts.id =
gra.account_id
- )`),
}
if stateManager.IsIncremental() {
since := stateManager.GetSince()
if since != nil {
- clauses = append(clauses,
dal.Where("_tool_github_accounts.updated_at >= ?", since))
+ // Incremental cursor intentionally
tracks _tool_github_repo_accounts.updated_at
+ // (repo membership), not
_tool_github_accounts.updated_at (profile freshness):
+ // account-detail re-enrichment is
reconciled on the next full sync. Do not switch
+ // this back to _tool_github_accounts —
that is what left issue/PR-only authors
+ // orphaned (#8886).
+ clauses = append(clauses,
dal.Where("_tool_github_repo_accounts.updated_at >= ?", since))
}
}
return db.Cursor(clauses...)
},
- Convert: func(githubUser *models.GithubAccount) ([]interface{},
errors.Error) {
+ Convert: func(githubUser *repoAccountForConvert)
([]interface{}, errors.Error) {
// query related orgs
var orgs []string
err := db.Pluck(`org_login`, &orgs,
diff --git a/backend/plugins/github/tasks/pr_convertor.go
b/backend/plugins/github/tasks/pr_convertor.go
index 48a40cc4e..c4e27a6b9 100644
--- a/backend/plugins/github/tasks/pr_convertor.go
+++ b/backend/plugins/github/tasks/pr_convertor.go
@@ -87,7 +87,6 @@ func ConvertPullRequests(taskCtx plugin.SubTaskContext)
errors.Error {
OriginalStatus: pr.State,
Title: pr.Title,
Url: pr.Url,
- AuthorId:
accountIdGen.Generate(data.Options.ConnectionId, pr.AuthorId),
AuthorName: pr.AuthorName,
Description: pr.Body,
CreatedDate: pr.GithubCreatedAt,
@@ -104,9 +103,17 @@ func ConvertPullRequests(taskCtx plugin.SubTaskContext)
errors.Error {
Additions: pr.Additions,
Deletions: pr.Deletions,
MergedByName: pr.MergedByName,
- MergedById:
accountIdGen.Generate(data.Options.ConnectionId, pr.MergedById),
IsDraft: pr.IsDraft,
}
+ // Generate account ids only for real users (#8886): a
zero AuthorId (deleted
+ // user) or zero MergedById (unmerged PR) would
otherwise produce an id like
+ // github:GithubAccount:1:0 that no accounts row can
ever match.
+ if pr.AuthorId != 0 {
+ domainPr.AuthorId =
accountIdGen.Generate(data.Options.ConnectionId, pr.AuthorId)
+ }
+ if pr.MergedById != 0 {
+ domainPr.MergedById =
accountIdGen.Generate(data.Options.ConnectionId, pr.MergedById)
+ }
if pr.State == "open" || pr.State == "OPEN" {
domainPr.Status = code.OPEN
} else if pr.State == "MERGED" || (pr.State == "closed"
&& (pr.Merged || pr.MergedAt != nil)) {
diff --git a/backend/plugins/github/tasks/pr_extractor.go
b/backend/plugins/github/tasks/pr_extractor.go
index 802846f30..9c3f1ca8f 100644
--- a/backend/plugins/github/tasks/pr_extractor.go
+++ b/backend/plugins/github/tasks/pr_extractor.go
@@ -151,6 +151,16 @@ func ExtractApiPullRequests(taskCtx plugin.SubTaskContext)
errors.Error {
githubPr.AuthorName = githubUser.Login
githubPr.AuthorId = githubUser.AccountId
}
+ // Emit a repo_account for the merged-by user too, so
pull_requests.merged_by_id
+ // resolves to a domain account instead of an orphan FK
(ConvertAccounts sources
+ // every referenced user from
_tool_github_repo_accounts).
+ if body.MergedBy != nil {
+ mergedByUser, err :=
convertAccount(body.MergedBy, data.Options.GithubId, data.Options.ConnectionId)
+ if err != nil {
+ return nil, err
+ }
+ results = append(results, mergedByUser)
+ }
for _, label := range body.Labels {
results = append(results, &models.GithubPrLabel{
ConnectionId: data.Options.ConnectionId,