hezyin commented on code in PR #2050:
URL: https://github.com/apache/incubator-devlake/pull/2050#discussion_r894179711
##########
plugins/gitextractor/parser/repo.go:
##########
@@ -0,0 +1,276 @@
+package parser
+
+import (
+ "fmt"
+ "github.com/apache/incubator-devlake/models/domainlayer"
+ "github.com/apache/incubator-devlake/models/domainlayer/code"
+ "github.com/apache/incubator-devlake/plugins/core"
+ git "github.com/libgit2/git2go/v33"
+)
+
+type GitRepo struct {
+ *LibGit2
+ id string
+ repo *git.Repository
+ cleanup func()
+}
+
+func (r *GitRepo) CollectAll(subtaskCtx core.SubTaskContext) error {
+ r.taskCtx.SetProgress(0, -1)
+ err := r.CollectTags(subtaskCtx)
+ if err != nil {
+ return err
+ }
+ err = r.CollectBranches(subtaskCtx)
+ if err != nil {
+ return err
+ }
+ return r.CollectCommits(subtaskCtx)
+}
+
+func (r *GitRepo) Close() {
+ r.store.Close()
+ if r.cleanup != nil {
+ r.cleanup()
+ }
+}
+
+func (r *GitRepo) CollectTags(subtaskCtx core.SubTaskContext) error {
+ return r.repo.Tags.Foreach(func(name string, id *git.Oid) error {
+ select {
+ case <-r.ctx.Done():
+ return r.ctx.Err()
+ default:
+ break
+ }
+ var err1 error
+ var tag *git.Tag
+ var tagCommit string
+ tag, _ = r.repo.LookupTag(id)
+ if tag != nil {
+ tagCommit = tag.TargetId().String()
+ } else {
+ tagCommit = id.String()
+ }
+ r.logger.Info("tagCommit", tagCommit)
+ if tagCommit != "" {
+ ref := &code.Ref{
+ DomainEntity: domainlayer.DomainEntity{Id:
fmt.Sprintf("%s:%s", r.id, name)},
+ RepoId: r.id,
+ Name: name,
+ CommitSha: tagCommit,
+ RefType: TAG,
+ }
+ err1 = r.store.Refs(ref)
+ if err1 != nil {
+ return err1
+ }
+ r.taskCtx.IncProgress(1)
+ }
+ return nil
+ })
+}
+
+func (r *GitRepo) CollectBranches(subtaskCtx core.SubTaskContext) error {
+ var repoInter *git.BranchIterator
+ repoInter, err := r.repo.NewBranchIterator(git.BranchAll)
+ if err != nil {
+ return err
+ }
+ return repoInter.ForEach(func(branch *git.Branch, branchType
git.BranchType) error {
+ select {
+ case <-r.ctx.Done():
+ return r.ctx.Err()
+ default:
+ break
+ }
+ if branch.IsBranch() {
+ name, err1 := branch.Name()
+ if err1 != nil {
+ return err1
+ }
+ var sha string
+ if oid := branch.Target(); oid != nil {
+ sha = oid.String()
+ }
+ ref := &code.Ref{
+ DomainEntity: domainlayer.DomainEntity{Id:
fmt.Sprintf("%s:%s", r.id, name)},
+ RepoId: r.id,
+ Name: name,
+ CommitSha: sha,
+ RefType: BRANCH,
+ }
+ ref.IsDefault, _ = branch.IsHead()
+ err1 = r.store.Refs(ref)
+ if err1 != nil {
+ return err1
+ }
+ subtaskCtx.IncProgress(1)
+ return nil
+ }
+ return nil
+ })
+}
+
+func (r *GitRepo) CollectCommits(subtaskCtx core.SubTaskContext) error {
+ opts, err := getDiffOpts()
+ if err != nil {
+ return err
+ }
+ odb, err := r.repo.Odb()
+ if err != nil {
+ return err
+ }
+ return odb.ForEach(func(id *git.Oid) error {
+ select {
+ case <-r.ctx.Done():
+ return r.ctx.Err()
+ default:
+ break
+ }
+ commit, _ := r.repo.LookupCommit(id)
+ if commit == nil {
+ return nil
+ }
+ commitSha := commit.Id().String()
+ r.logger.Debug("process commit: %s", commitSha)
+ c := &code.Commit{
+ Sha: commitSha,
+ Message: commit.Message(),
+ }
+ author := commit.Author()
+ if author != nil {
+ c.AuthorName = author.Name
+ c.AuthorEmail = author.Email
+ c.AuthorId = author.Email
+ c.AuthoredDate = author.When
+ }
+ committer := commit.Committer()
+ if committer != nil {
+ c.CommitterName = committer.Name
+ c.CommitterEmail = committer.Email
+ c.CommitterId = committer.Email
+ c.CommittedDate = committer.When
+ }
+ if err != r.storeParentCommits(commitSha, commit) {
+ return err
+ }
+ if commit.ParentCount() > 0 {
+ parent := commit.Parent(0)
+ if parent != nil {
+ var stats *git.DiffStats
+ if stats, err =
r.getDiffComparedToParent(c.Sha, commit, parent, opts); err != nil {
+ return err
+ } else {
+ c.Additions += stats.Insertions()
+ c.Deletions += stats.Deletions()
+ }
+ }
+ }
+ err = r.store.Commits(c)
+ if err != nil {
+ return err
+ }
+ repoCommit := &code.RepoCommit{
+ RepoId: r.id,
+ CommitSha: c.Sha,
+ }
+ err = r.store.RepoCommits(repoCommit)
+ if err != nil {
+ return err
+ }
+ subtaskCtx.IncProgress(1)
+ return nil
+ })
+}
+
+func (r *GitRepo) storeParentCommits(commitSha string, commit *git.Commit)
error {
+ var commitParents []*code.CommitParent
+ for i := uint(0); i < commit.ParentCount(); i++ {
+ parent := commit.Parent(i)
+ if parent != nil {
+ if parentId := parent.Id(); parentId != nil {
+ commitParents = append(commitParents,
&code.CommitParent{
+ CommitSha: commitSha,
+ ParentCommitSha: parentId.String(),
+ })
+ }
+ }
+ }
+ return r.store.CommitParents(commitParents)
+}
+
+func (r *GitRepo) getDiffComparedToParent(commitSha string, commit
*git.Commit, parent *git.Commit, opts *git.DiffOptions) (*git.DiffStats, error)
{
+ var err error
+ var parentTree, tree *git.Tree
+ parentTree, err = parent.Tree()
+ if err != nil {
+ return nil, err
+ }
+ tree, err = commit.Tree()
+ if err != nil {
+ return nil, err
+ }
+ var diff *git.Diff
+ diff, err = r.repo.DiffTreeToTree(parentTree, tree, opts)
+ if err != nil {
+ return nil, err
+ }
+ err = r.storeCommitFileFromDiff(commitSha, diff)
+ if err != nil {
+ return nil, err
+ }
+ var stats *git.DiffStats
+ stats, err = diff.Stats()
+ if err != nil {
+ return nil, err
+ }
+ return stats, nil
+}
+
+func (r *GitRepo) storeCommitFileFromDiff(commitSha string, diff *git.Diff)
error {
Review Comment:
Since we're storing multiple `code.CommitFile` records, maybe
`storeCommitFilesFromDiff` is a more appropriate name.
##########
plugins/gitextractor/gitextractor.go:
##########
@@ -35,7 +39,9 @@ func (plugin GitExtractor) Description() string {
// return all available subtasks, framework will run them for you in order
func (plugin GitExtractor) SubTaskMetas() []core.SubTaskMeta {
return []core.SubTaskMeta{
- tasks.CollectGitRepoMeta,
+ tasks.CollectGitCommitRepoMeta,
Review Comment:
We can remove the "Repo" from our new subtask names, e.g.,
`CollectGitCommitRepoMeta` -> `CollectGitCommitMeta`.
##########
plugins/gitextractor/parser/libgit2.go:
##########
@@ -35,247 +31,34 @@ const (
)
type LibGit2 struct {
Review Comment:
I wonder if we still need the `LibGit2` struct. Its fields aren't actively
used by its methods. Maybe we can turn those methods into plain functions
without a `LibGit2` receiver and move the useful fields into the `GitRepo`
struct directly. Let me know if you think we still need the `LibGit2` struct.
##########
plugins/gitextractor/tasks/git_repo_collector.go:
##########
@@ -53,28 +52,49 @@ func (o GitExtractorOptions) Valid() error {
return nil
}
-func CollectGitRepo(subTaskCtx core.SubTaskContext) error {
- var err error
- db := subTaskCtx.GetDb()
- storage := store.NewDatabase(db, subTaskCtx.GetLogger())
- op := subTaskCtx.GetData().(GitExtractorOptions)
- p := parser.NewLibGit2(storage, subTaskCtx)
- if strings.HasPrefix(op.Url, "http") {
- err = p.CloneOverHTTP(op.RepoId, op.Url, op.User, op.Password,
op.Proxy)
- } else if url := strings.TrimPrefix(op.Url, "ssh://");
strings.HasPrefix(url, "git@") {
- err = p.CloneOverSSH(op.RepoId, url, op.PrivateKey,
op.Passphrase)
- } else if strings.HasPrefix(op.Url, "/") {
- err = p.LocalRepo(op.Url, op.RepoId)
- }
- if err != nil {
- return err
+func CollectGitCommits(subTaskCtx core.SubTaskContext) error {
+ repo := getGitRepo(subTaskCtx)
+ subTaskCtx.SetProgress(0, -1)
Review Comment:
Here we may have an opportunity to improve progress reporting for the git
extractor plugin. IIRC, we set the progress to `-1` when we don't know how many
total items need to be processed. If we were able to figure out how many
commits/branches/tags there are, then maybe we could set the progress to that
number before the collection process begins, and increment the progress by 1
in each iteration.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]