This is an automated email from the ASF dual-hosted git repository.

klesh pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/devlake.git


The following commit(s) were added to refs/heads/main by this push:
     new f4888416e fix(github): create domain accounts for non-committer 
authors (#8886) (#8894)
f4888416e is described below

commit f4888416e400c245c55b4d5643529a64fc3d86f7
Author: Juan Antonio Osorio <[email protected]>
AuthorDate: Fri Jun 12 16:21:22 2026 +0300

    fix(github): create domain accounts for non-committer authors (#8886) 
(#8894)
    
    * fix(github): create domain accounts for non-committer authors (#8886)
    
    ConvertAccounts sourced the domain `accounts` table FROM 
_tool_github_accounts,
    which is only populated for users we collected full profiles for 
(effectively,
    committers). Issue and PR authors who never committed were written into
    _tool_github_repo_accounts but never converted, so issues.creator_id and
    pull_requests.author_id pointed at accounts rows that didn't exist.
    
    Source ConvertAccounts FROM _tool_github_repo_accounts LEFT JOIN
    _tool_github_accounts instead, so every user the repo references gets a 
domain
    account, enriched with profile detail when we have it and login-only 
otherwise.
    The domain id uses the same generator the issue/PR convertors use, so the 
FKs
    line up. Also emit a repo_account for a PR's merged_by user so
    pull_requests.merged_by_id resolves too.
    
    The query stays MySQL/PostgreSQL-agnostic (COALESCE, no backtick quoting,
    parameterized via the dal) and mirrors the join already in
    account_org_collector.go.
    
    Adds the non-committer orphan case to the e2e fixture plus a referential-
    integrity assertion in TestAccountDataFlow. Verified on both MySQL and
    PostgreSQL.
    
    Co-Authored-By: Claude Opus 4.8 (1M context) <[email protected]>
    
    * fix(github): carry _raw_data provenance and guard zero account ids
    
    Review feedback on #8894: the rewritten ConvertAccounts dropped the
    _raw_data fields from converted accounts. Select them with COALESCE,
    preferring the enriched _tool_github_accounts row and falling back to
    the _tool_github_repo_accounts row for non-committers. The e2e fixture
    now carries the _raw_data columns like every other tool fixture, and
    the regenerated snapshot pins the pre-existing provenance for enriched
    accounts plus the issue-extractor provenance for the non-committer
    case.
    
    Also guard PR author and merged-by id generation against zero ids: an
    unmerged PR or a deleted user otherwise yields the domain id
    github:GithubAccount:<conn>:0, the same orphan-FK shape this PR fixes.
    issue_convertor already guards its AuthorId the same way.
    
    Verified with the full github e2e suite on both MySQL and PostgreSQL.
    
    Co-Authored-By: Claude Fable 5 <[email protected]>
    
    ---------
    
    Co-authored-by: Claude Opus 4.8 (1M context) <[email protected]>
---
 backend/plugins/github/e2e/account_test.go         | 33 ++++++++++
 .../e2e/raw_tables/_tool_github_repo_accounts.csv  | 33 +++++-----
 .../plugins/github/e2e/snapshot_tables/account.csv |  1 +
 backend/plugins/github/tasks/account_convertor.go  | 70 +++++++++++++++++-----
 backend/plugins/github/tasks/pr_convertor.go       | 11 +++-
 backend/plugins/github/tasks/pr_extractor.go       | 10 ++++
 6 files changed, 126 insertions(+), 32 deletions(-)

diff --git a/backend/plugins/github/e2e/account_test.go 
b/backend/plugins/github/e2e/account_test.go
index 817ef6fab..7b6466aba 100644
--- a/backend/plugins/github/e2e/account_test.go
+++ b/backend/plugins/github/e2e/account_test.go
@@ -20,11 +20,15 @@ package e2e
 import (
        "testing"
 
+       "github.com/apache/incubator-devlake/core/dal"
        
"github.com/apache/incubator-devlake/core/models/domainlayer/crossdomain"
+       "github.com/apache/incubator-devlake/core/models/domainlayer/didgen"
        "github.com/apache/incubator-devlake/helpers/e2ehelper"
        "github.com/apache/incubator-devlake/plugins/github/impl"
        "github.com/apache/incubator-devlake/plugins/github/models"
        "github.com/apache/incubator-devlake/plugins/github/tasks"
+       "github.com/stretchr/testify/assert"
+       "github.com/stretchr/testify/require"
 )
 
 func TestAccountDataFlow(t *testing.T) {
@@ -108,4 +112,33 @@ func TestAccountDataFlow(t *testing.T) {
                        "_raw_data_remark",
                },
        )
+
+       // Referential-integrity invariant (#8886): every account the repo 
references in
+       // _tool_github_repo_accounts must have a domain `accounts` row, so 
issues.creator_id /
+       // pull_requests.author_id / merged_by_id never point at a missing 
account. We generate
+       // the domain id with the SAME generator the issue/PR convertors use, 
so this is a
+       // faithful proxy for the FK join the issue reported as broken. It also 
fails loudly if a
+       // future change shrinks ConvertAccounts' coverage or diverges the id 
generation.
+       accountIdGen := didgen.NewDomainIdGenerator(&models.GithubAccount{})
+       var repoAccounts []models.GithubRepoAccount
+       require.NoError(t, dataflowTester.Dal.All(&repoAccounts,
+               dal.Where("repo_github_id = ? AND connection_id = ? AND 
account_id > 0",
+                       taskData.Options.GithubId, 
taskData.Options.ConnectionId),
+       ))
+       require.NotEmpty(t, repoAccounts, "fixture must reference at least one 
account")
+       sawOrphanCase := false
+       for _, ra := range repoAccounts {
+               if ra.Login == "milichev" {
+                       sawOrphanCase = true // the non-committer author from 
the issue repro
+               }
+               domainId := 
accountIdGen.Generate(taskData.Options.ConnectionId, ra.AccountId)
+               count, err := dataflowTester.Dal.Count(
+                       dal.From(&crossdomain.Account{}),
+                       dal.Where("id = ?", domainId),
+               )
+               require.NoError(t, err)
+               assert.Equalf(t, int64(1), count,
+                       "orphan FK: repo account %q (id=%d) has no domain 
accounts row %q", ra.Login, ra.AccountId, domainId)
+       }
+       assert.True(t, sawOrphanCase, "fixture should include the non-committer 
orphan case (milichev)")
 }
diff --git 
a/backend/plugins/github/e2e/raw_tables/_tool_github_repo_accounts.csv 
b/backend/plugins/github/e2e/raw_tables/_tool_github_repo_accounts.csv
index 8b8666442..7aa42a4b0 100644
--- a/backend/plugins/github/e2e/raw_tables/_tool_github_repo_accounts.csv
+++ b/backend/plugins/github/e2e/raw_tables/_tool_github_repo_accounts.csv
@@ -1,16 +1,17 @@
-connection_id,account_id,repo_github_id,login
-1,21979,134018330,appleboy
-1,964542,134018330,sarathsp06
-1,1052632,134018330,runner-mei
-1,3794113,134018330,shanhuhai5739
-1,3971390,134018330,ppmoon
-1,7496278,134018330,panjf2000
-1,8518239,134018330,gitter-badger
-1,11763614,2,Moonlight-Zhao
-1,12420699,2,shanghai-Jerry
-1,14950473,2,zqkgo
-1,22429695,2,codecov[bot]
-1,24841832,2,rikewang
-1,31087327,2,chensanle
-1,32893410,2,zhangyuanxue
-1,38849208,2,king526
\ No newline at end of file
+connection_id,account_id,repo_github_id,login,_raw_data_params,_raw_data_table,_raw_data_id,_raw_data_remark
+1,21979,134018330,appleboy,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,8,
+1,964542,134018330,sarathsp06,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,1,
+1,1052632,134018330,runner-mei,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,13,
+1,3794113,134018330,shanhuhai5739,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,2,
+1,3971390,134018330,ppmoon,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,14,
+1,7496278,134018330,panjf2000,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,5,
+1,8518239,134018330,gitter-badger,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,9,
+1,145564,134018330,milichev,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_issues,1,
+1,11763614,2,Moonlight-Zhao,,,0,
+1,12420699,2,shanghai-Jerry,,,0,
+1,14950473,2,zqkgo,,,0,
+1,22429695,2,codecov[bot],,,0,
+1,24841832,2,rikewang,,,0,
+1,31087327,2,chensanle,,,0,
+1,32893410,2,zhangyuanxue,,,0,
+1,38849208,2,king526,,,0,
diff --git a/backend/plugins/github/e2e/snapshot_tables/account.csv 
b/backend/plugins/github/e2e/snapshot_tables/account.csv
index 1092e7a52..c3022a4a2 100644
--- a/backend/plugins/github/e2e/snapshot_tables/account.csv
+++ b/backend/plugins/github/e2e/snapshot_tables/account.csv
@@ -1,5 +1,6 @@
 
id,email,full_name,user_name,avatar_url,organization,_raw_data_params,_raw_data_table,_raw_data_id,_raw_data_remark
 
github:GithubAccount:1:1052632,runner.mei@,runner,runner-mei,https://avatars.githubusercontent.com/u/1052632?v=4,,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,13,
+github:GithubAccount:1:145564,,,milichev,,,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_issues,1,
 github:GithubAccount:1:21979,[email protected],Bo-Yi 
Wu,appleboy,https://avatars.githubusercontent.com/u/21979?v=4,"COSCUP,nodejs-tw,moztw,h5bp,CodeIgniter-TW,drone,Getmore,golangtw,laravel-taiwan,go-xorm,gin-gonic,PHPConf-TW,Mediatek-Cloud,SJFinder,go-gitea,laradock,gin-contrib,tagfans,maintainers,go-training,go-ggz,the-benchmarker,golang-queue","{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,8,
 
github:GithubAccount:1:3794113,[email protected],Derek,shanhuhai5739,https://avatars.githubusercontent.com/u/3794113?v=4,,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,2,
 
github:GithubAccount:1:3971390,[email protected],ppmoon,ppmoon,https://avatars.githubusercontent.com/u/3971390?v=4,,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,14,
diff --git a/backend/plugins/github/tasks/account_convertor.go 
b/backend/plugins/github/tasks/account_convertor.go
index e86780d27..3d4db97d1 100644
--- a/backend/plugins/github/tasks/account_convertor.go
+++ b/backend/plugins/github/tasks/account_convertor.go
@@ -22,6 +22,7 @@ import (
 
        "github.com/apache/incubator-devlake/core/dal"
        "github.com/apache/incubator-devlake/core/errors"
+       "github.com/apache/incubator-devlake/core/models/common"
        "github.com/apache/incubator-devlake/core/models/domainlayer"
        
"github.com/apache/incubator-devlake/core/models/domainlayer/crossdomain"
        "github.com/apache/incubator-devlake/core/models/domainlayer/didgen"
@@ -30,6 +31,19 @@ import (
        "github.com/apache/incubator-devlake/plugins/github/models"
 )
 
+// repoAccountForConvert is the row projected by ConvertAccounts' query: every
+// account referenced by the repo (from _tool_github_repo_accounts), enriched
+// with profile detail from _tool_github_accounts when it was collected. The
+// embedded NoPKModel carries the RawDataOrigin across to the domain row.
+type repoAccountForConvert struct {
+       Id        int
+       Login     string
+       Name      string
+       Email     string
+       AvatarUrl string
+       common.NoPKModel
+}
+
 func init() {
        RegisterSubtaskMeta(&ConvertAccountsMeta)
 }
@@ -38,12 +52,12 @@ var ConvertAccountsMeta = plugin.SubTaskMeta{
        Name:             "Convert Users",
        EntryPoint:       ConvertAccounts,
        EnabledByDefault: true,
-       Description:      "Convert tool layer table github_accounts into  
domain layer table accounts",
+       Description:      "Convert every account referenced by the repo (tool 
layer repo_accounts, enriched by github_accounts) into domain layer table 
accounts",
        DomainTypes:      []string{plugin.DOMAIN_TYPE_CROSS},
        DependencyTables: []string{
-               models.GithubAccount{}.TableName(),     // cursor
-               models.GithubRepoAccount{}.TableName(), // cursor
-               models.GithubAccountOrg{}.TableName()}, // account id gen
+               models.GithubRepoAccount{}.TableName(), // cursor (every user 
referenced by the repo)
+               models.GithubAccount{}.TableName(),     // left-join enrichment 
(profile detail, optional)
+               models.GithubAccountOrg{}.TableName()}, // org pluck
        ProductTables: []string{crossdomain.Account{}.TableName()},
 }
 
@@ -53,7 +67,7 @@ func ConvertAccounts(taskCtx plugin.SubTaskContext) 
errors.Error {
 
        accountIdGen := didgen.NewDomainIdGenerator(&models.GithubAccount{})
 
-       converter, err := 
api.NewStatefulDataConverter(&api.StatefulDataConverterArgs[models.GithubAccount]{
+       converter, err := 
api.NewStatefulDataConverter(&api.StatefulDataConverterArgs[repoAccountForConvert]{
                SubtaskCommonArgs: &api.SubtaskCommonArgs{
                        SubTaskContext: taskCtx,
                        Table:          RAW_ACCOUNT_TABLE,
@@ -62,29 +76,57 @@ func ConvertAccounts(taskCtx plugin.SubTaskContext) 
errors.Error {
                                Name:         data.Options.Name,
                        },
                },
+               // Source every account referenced by this repo from 
_tool_github_repo_accounts
+               // (which the issue/PR/commit extractors populate for any 
author, assignee, or
+               // merged-by user), and LEFT JOIN _tool_github_accounts for 
profile detail when it
+               // was collected. This guarantees a domain `accounts` row for 
every CreatorId /
+               // AuthorId the other convertors emit, instead of only for 
users who committed.
+               // Raw-data provenance follows the same rule as the profile 
fields: the enriched
+               // _tool_github_accounts row when we collected one, the 
repo_accounts row otherwise.
+               // Note the consequence: fallback-provenance rows carry a 
_raw_data_table other than
+               // _raw_github_api_accounts, so the batch-save divider's 
full-sync delete-then-reinsert
+               // (keyed on this converter's raw table) never deletes them; 
they are reconciled by
+               // upsert only. Scope deletion still covers them via 
_raw_data_params.
+               // SQL is kept DB-agnostic (no backtick quoting, COALESCE not 
IFNULL) so it runs on
+               // both MySQL and PostgreSQL.
                Input: func(stateManager *api.SubtaskStateManager) (dal.Rows, 
errors.Error) {
                        clauses := []dal.Clause{
-                               dal.Select("_tool_github_accounts.*"),
-                               dal.From(&models.GithubAccount{}),
+                               
dal.Select(`_tool_github_repo_accounts.account_id AS id,
+                                       _tool_github_repo_accounts.login AS 
login,
+                                       COALESCE(ga.name, '') AS name,
+                                       COALESCE(ga.email, '') AS email,
+                                       COALESCE(ga.avatar_url, '') AS 
avatar_url,
+                                       COALESCE(ga._raw_data_params, 
_tool_github_repo_accounts._raw_data_params) AS _raw_data_params,
+                                       COALESCE(ga._raw_data_table, 
_tool_github_repo_accounts._raw_data_table) AS _raw_data_table,
+                                       COALESCE(ga._raw_data_id, 
_tool_github_repo_accounts._raw_data_id) AS _raw_data_id,
+                                       COALESCE(ga._raw_data_remark, 
_tool_github_repo_accounts._raw_data_remark) AS _raw_data_remark`),
+                               dal.From(&models.GithubRepoAccount{}),
+                               dal.Join(`left join _tool_github_accounts ga on 
(
+                                       ga.connection_id = 
_tool_github_repo_accounts.connection_id
+                                       AND ga.id = 
_tool_github_repo_accounts.account_id
+                               )`),
                                dal.Where(
-                                       "repo_github_id = ? and 
_tool_github_accounts.connection_id=?",
+                                       
`_tool_github_repo_accounts.repo_github_id = ?
+                                               AND 
_tool_github_repo_accounts.connection_id = ?
+                                               AND 
_tool_github_repo_accounts.account_id > 0`,
                                        data.Options.GithubId,
                                        data.Options.ConnectionId,
                                ),
-                               dal.Join(`left join _tool_github_repo_accounts 
gra on (
-                                       _tool_github_accounts.connection_id = 
gra.connection_id
-                                       AND _tool_github_accounts.id = 
gra.account_id
-                               )`),
                        }
                        if stateManager.IsIncremental() {
                                since := stateManager.GetSince()
                                if since != nil {
-                                       clauses = append(clauses, 
dal.Where("_tool_github_accounts.updated_at >= ?", since))
+                                       // Incremental cursor intentionally 
tracks _tool_github_repo_accounts.updated_at
+                                       // (repo membership), not 
_tool_github_accounts.updated_at (profile freshness):
+                                       // account-detail re-enrichment is 
reconciled on the next full sync. Do not switch
+                                       // this back to _tool_github_accounts — 
that is what left issue/PR-only authors
+                                       // orphaned (#8886).
+                                       clauses = append(clauses, 
dal.Where("_tool_github_repo_accounts.updated_at >= ?", since))
                                }
                        }
                        return db.Cursor(clauses...)
                },
-               Convert: func(githubUser *models.GithubAccount) ([]interface{}, 
errors.Error) {
+               Convert: func(githubUser *repoAccountForConvert) 
([]interface{}, errors.Error) {
                        // query related orgs
                        var orgs []string
                        err := db.Pluck(`org_login`, &orgs,
diff --git a/backend/plugins/github/tasks/pr_convertor.go 
b/backend/plugins/github/tasks/pr_convertor.go
index 48a40cc4e..c4e27a6b9 100644
--- a/backend/plugins/github/tasks/pr_convertor.go
+++ b/backend/plugins/github/tasks/pr_convertor.go
@@ -87,7 +87,6 @@ func ConvertPullRequests(taskCtx plugin.SubTaskContext) 
errors.Error {
                                OriginalStatus: pr.State,
                                Title:          pr.Title,
                                Url:            pr.Url,
-                               AuthorId:       
accountIdGen.Generate(data.Options.ConnectionId, pr.AuthorId),
                                AuthorName:     pr.AuthorName,
                                Description:    pr.Body,
                                CreatedDate:    pr.GithubCreatedAt,
@@ -104,9 +103,17 @@ func ConvertPullRequests(taskCtx plugin.SubTaskContext) 
errors.Error {
                                Additions:      pr.Additions,
                                Deletions:      pr.Deletions,
                                MergedByName:   pr.MergedByName,
-                               MergedById:     
accountIdGen.Generate(data.Options.ConnectionId, pr.MergedById),
                                IsDraft:        pr.IsDraft,
                        }
+                       // Generate account ids only for real users (#8886): a 
zero AuthorId (deleted
+                       // user) or zero MergedById (unmerged PR) would 
otherwise produce an id like
+                       // github:GithubAccount:1:0 that no accounts row can 
ever match.
+                       if pr.AuthorId != 0 {
+                               domainPr.AuthorId = 
accountIdGen.Generate(data.Options.ConnectionId, pr.AuthorId)
+                       }
+                       if pr.MergedById != 0 {
+                               domainPr.MergedById = 
accountIdGen.Generate(data.Options.ConnectionId, pr.MergedById)
+                       }
                        if pr.State == "open" || pr.State == "OPEN" {
                                domainPr.Status = code.OPEN
                        } else if pr.State == "MERGED" || (pr.State == "closed" 
&& (pr.Merged || pr.MergedAt != nil)) {
diff --git a/backend/plugins/github/tasks/pr_extractor.go 
b/backend/plugins/github/tasks/pr_extractor.go
index 802846f30..9c3f1ca8f 100644
--- a/backend/plugins/github/tasks/pr_extractor.go
+++ b/backend/plugins/github/tasks/pr_extractor.go
@@ -151,6 +151,16 @@ func ExtractApiPullRequests(taskCtx plugin.SubTaskContext) 
errors.Error {
                                githubPr.AuthorName = githubUser.Login
                                githubPr.AuthorId = githubUser.AccountId
                        }
+                       // Emit a repo_account for the merged-by user too, so 
pull_requests.merged_by_id
+                       // resolves to a domain account instead of an orphan FK 
(ConvertAccounts sources
+                       // every referenced user from 
_tool_github_repo_accounts).
+                       if body.MergedBy != nil {
+                               mergedByUser, err := 
convertAccount(body.MergedBy, data.Options.GithubId, data.Options.ConnectionId)
+                               if err != nil {
+                                       return nil, err
+                               }
+                               results = append(results, mergedByUser)
+                       }
                        for _, label := range body.Labels {
                                results = append(results, &models.GithubPrLabel{
                                        ConnectionId: data.Options.ConnectionId,

Reply via email to