This is an automated email from the ASF dual-hosted git repository. klesh pushed a commit to branch kw-optimize-git-clone in repository https://gitbox.apache.org/repos/asf/incubator-devlake.git
commit 79950463849c6870b5ac08a42393fa15c94c0f76 Author: Klesh Wong <[email protected]> AuthorDate: Mon Apr 1 18:25:56 2024 +0800 feat(gitextractor): skip commit stat to speed up pipeline --- backend/plugins/gitextractor/gitextractor.go | 60 ++++++++++++++ backend/plugins/gitextractor/impl/impl.go | 27 +++++- backend/plugins/gitextractor/main.go | 95 ---------------------- backend/plugins/gitextractor/parser/repo_gogit.go | 35 ++++---- .../plugins/gitextractor/parser/repo_libgit2.go | 28 +++---- backend/plugins/gitextractor/parser/taskdata.go | 57 +++++++++++++ backend/plugins/gitextractor/tasks/clone.go | 8 +- .../gitextractor/tasks/git_repo_collector.go | 35 +------- 8 files changed, 177 insertions(+), 168 deletions(-) diff --git a/backend/plugins/gitextractor/gitextractor.go b/backend/plugins/gitextractor/gitextractor.go new file mode 100644 index 000000000..5032381aa --- /dev/null +++ b/backend/plugins/gitextractor/gitextractor.go @@ -0,0 +1,60 @@ +/* +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. See the NOTICE file distributed with +this work for additional information regarding copyright ownership. +The ASF licenses this file to You under the Apache License, Version 2.0 +(the "License"); you may not use this file except in compliance with +the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main // must be main for plugin entry point + +import ( + "github.com/apache/incubator-devlake/core/runner" + "github.com/apache/incubator-devlake/plugins/gitextractor/impl" + "github.com/spf13/cobra" +) + +var PluginEntry impl.GitExtractor //nolint + +// standalone mode for debugging +func main() { + cmd := &cobra.Command{Use: "gitextractor"} + url := cmd.Flags().StringP("url", "l", "", "repo url") + repoId := cmd.Flags().StringP("repoId", "i", "", "domain layer repo id") + user := cmd.Flags().StringP("user", "u", "", "username") + password := cmd.Flags().StringP("password", "p", "", "password") + // pk := cmd.Flags().StringP("privateKey", "k", "", "private key file") + // pkPass := cmd.Flags().StringP("privateKeyPassPhrase", "P", "", "passphrase for private key") + proxy := cmd.Flags().StringP("proxy", "x", "", "proxy") + useGoGit := cmd.Flags().BoolP("useGoGit", "g", false, "use go-git instead of libgit2") + skipCommitStat := cmd.Flags().BoolP("skipCommitStat", "S", true, "") + skipCommitFiles := cmd.Flags().BoolP("skipCommitFiles", "F", true, "") + timeAfter := cmd.Flags().StringP("timeAfter", "a", "", "collect data that are created after specified time, ie 2006-01-02T15:04:05Z") + _ = cmd.MarkFlagRequired("url") + _ = cmd.MarkFlagRequired("repoId") + + cmd.Run = func(c *cobra.Command, args []string) { + runner.DirectRun(c, args, PluginEntry, map[string]interface{}{ + "url": *url, + "repoId": *repoId, + "user": *user, + "password": *password, + // "privateKey": * + // "passphrase" + "proxy": *proxy, + "useGoGit": *useGoGit, + "skipCommitStat": skipCommitStat, + "skipCommitFiles": skipCommitFiles, + }, *timeAfter) + } + runner.RunCmd(cmd) +} diff --git a/backend/plugins/gitextractor/impl/impl.go b/backend/plugins/gitextractor/impl/impl.go index d8d5413d4..3f7f409c8 100644 --- a/backend/plugins/gitextractor/impl/impl.go +++ b/backend/plugins/gitextractor/impl/impl.go @@ -22,6 +22,7 @@ import ( "github.com/apache/incubator-devlake/core/errors" "github.com/apache/incubator-devlake/core/plugin" helper "github.com/apache/incubator-devlake/helpers/pluginhelper/api" + "github.com/apache/incubator-devlake/plugins/gitextractor/parser" "github.com/apache/incubator-devlake/plugins/gitextractor/tasks" ) @@ -58,21 +59,41 @@ func (p GitExtractor) SubTaskMetas() []plugin.SubTaskMeta { // PrepareTaskData based on task context and user input options, return data that shared among all subtasks func (p GitExtractor) PrepareTaskData(taskCtx plugin.TaskContext, options map[string]interface{}) (interface{}, errors.Error) { - var op tasks.GitExtractorOptions + log := taskCtx.GetLogger().Nested("gitextractor.PrepareTaskData") + var op parser.GitExtractorOptions if err := helper.Decode(options, &op, nil); err != nil { return nil, err } if err := op.Valid(); err != nil { return nil, err } - taskData := &tasks.GitExtractorTaskData{ + + // commit stat, especially commit files(part of stat) are expensive to collect, so we skip them by default + cfg := taskCtx.GetConfigReader() + loadBool := func(optValue **bool, key string, defValue bool) { + // if user specified the option, use it + if *optValue != nil { + return + } + // or fallback to .env configuration + if cfg.IsSet(key) { + defValue = cfg.GetBool(key) + } + *optValue = &defValue + } + loadBool(&op.SkipCommitStat, "SKIP_COMMIT_STAT", true) + loadBool(&op.SkipCommitFiles, "SKIP_COMMIT_FILES", true) + log.Info("SkipCommitStat: %v", *op.SkipCommitStat) + log.Info("SkipCommitFiles: %v", *op.SkipCommitFiles) + + taskData := &parser.GitExtractorTaskData{ Options: &op, } return taskData, nil } func (p GitExtractor) Close(taskCtx plugin.TaskContext) errors.Error { - if taskData, ok := taskCtx.GetData().(*tasks.GitExtractorTaskData); ok { + if taskData, ok := taskCtx.GetData().(*parser.GitExtractorTaskData); ok { if taskData.GitRepo != nil { if err := taskData.GitRepo.Close(taskCtx.GetContext()); err != nil { return errors.Convert(err) diff --git a/backend/plugins/gitextractor/main.go b/backend/plugins/gitextractor/main.go deleted file mode 100644 index f9cc30490..000000000 --- a/backend/plugins/gitextractor/main.go +++ /dev/null @@ -1,95 +0,0 @@ -/* -Licensed to the Apache Software Foundation (ASF) under one or more -contributor license agreements. See the NOTICE file distributed with -this work for additional information regarding copyright ownership. -The ASF licenses this file to You under the Apache License, Version 2.0 -(the "License"); you may not use this file except in compliance with -the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package main - -import ( - "context" - "flag" - "github.com/apache/incubator-devlake/core/config" - "github.com/apache/incubator-devlake/core/runner" - contextimpl "github.com/apache/incubator-devlake/impls/context" - "github.com/apache/incubator-devlake/impls/dalgorm" - "github.com/apache/incubator-devlake/impls/logruslog" - "github.com/apache/incubator-devlake/plugins/gitextractor/impl" - "github.com/apache/incubator-devlake/plugins/gitextractor/models" - "github.com/apache/incubator-devlake/plugins/gitextractor/store" - "github.com/apache/incubator-devlake/plugins/gitextractor/tasks" -) - -// PluginEntry is a variable exported for Framework to search and load -var PluginEntry impl.GitExtractor //nolint - -func main() { - url := flag.String("url", "", "-url") - proxy := flag.String("proxy", "", "-proxy") - id := flag.String("id", "", "-id") - user := flag.String("user", "", "-user") - password := flag.String("password", "", "-password") - output := flag.String("output", "", "-output") - dbUrl := flag.String("db", "", "-db") - flag.Parse() - cfg := config.GetConfig() - logger := logruslog.Global.Nested("git extractor") - var storage models.Store - var err error - if *url == "" { - panic("url is missing") - } - if *id == "" { - panic("id is missing") - } - db, err := runner.NewGormDb(cfg, logger) - if err != nil { - panic(err) - } - basicRes := contextimpl.NewDefaultBasicRes(cfg, logger, dalgorm.NewDalgorm(db)) - if *output != "" { - storage, err = store.NewCsvStore(*output) - if err != nil { - panic(err) - } - } else if *dbUrl != "" { - cfg.Set("DB_URL", *dbUrl) - } - // If we didn't specify output or dburl, we will use db by default - if storage == nil { - storage = store.NewDatabase(basicRes, *id) - } - defer storage.Close() - ctx := context.Background() - subTaskCtx := contextimpl.NewStandaloneSubTaskContext( - ctx, - basicRes, - "git extractor", - nil, - ) - repo, err := tasks.NewGitRepo(subTaskCtx, logger, storage, &tasks.GitExtractorOptions{ - RepoId: *id, - Url: *url, - User: *user, - Password: *password, - Proxy: *proxy, - }) - if err != nil { - panic(err) - } - defer repo.Close(ctx) - if err = repo.CollectAll(subTaskCtx); err != nil { - panic(err) - } -} diff --git a/backend/plugins/gitextractor/parser/repo_gogit.go b/backend/plugins/gitextractor/parser/repo_gogit.go index 82be9b365..b12c87c43 100644 --- a/backend/plugins/gitextractor/parser/repo_gogit.go +++ b/backend/plugins/gitextractor/parser/repo_gogit.go @@ -22,6 +22,8 @@ import ( "crypto/sha256" "encoding/hex" "fmt" + "regexp" + "github.com/apache/incubator-devlake/core/dal" "github.com/apache/incubator-devlake/core/errors" "github.com/apache/incubator-devlake/core/log" @@ -33,7 +35,6 @@ import ( "github.com/go-git/go-git/v5/plumbing" "github.com/go-git/go-git/v5/plumbing/object" "github.com/go-git/go-git/v5/plumbing/storer" - "regexp" ) type GoGitRepo struct { @@ -268,13 +269,13 @@ func (r *GoGitRepo) getComponentMap(subtaskCtx plugin.SubTaskContext) (map[strin // CollectCommits Collect data from each commit, we can also get the diff line func (r *GoGitRepo) CollectCommits(subtaskCtx plugin.SubTaskContext) (err error) { + taskOpts := subtaskCtx.GetData().(*GitExtractorTaskData).Options // check it first componentMap, err := r.getComponentMap(subtaskCtx) if err != nil { return err } - skipCommitFiles := subtaskCtx.GetConfigReader().GetBool(SkipCommitFiles) repo := r.repo store := r.store @@ -306,19 +307,21 @@ func (r *GoGitRepo) CollectCommits(subtaskCtx plugin.SubTaskContext) (err error) return err } - stats, err := commit.StatsContext(subtaskCtx.GetContext()) - if err != nil { - return err - } else { - for _, stat := range stats { - codeCommit.Additions += stat.Addition - // In some repos, deletion may be zero, which is different from git log --stat. - // It seems go-git doesn't get the correct changes. - // I have run object.DiffTreeWithOptions manually with different diff algorithms, - // but get the same result with StatsContext. - // I cannot reproduce it with another repo. - // A similar issue: https://github.com/go-git/go-git/issues/367 - codeCommit.Deletions += stat.Deletion + if !*taskOpts.SkipCommitStat { + stats, err := commit.StatsContext(subtaskCtx.GetContext()) + if err != nil { + return err + } else { + for _, stat := range stats { + codeCommit.Additions += stat.Addition + // In some repos, deletion may be zero, which is different from git log --stat. + // It seems go-git doesn't get the correct changes. + // I have run object.DiffTreeWithOptions manually with different diff algorithms, + // but get the same result with StatsContext. + // I cannot reproduce it with another repo. + // A similar issue: https://github.com/go-git/go-git/issues/367 + codeCommit.Deletions += stat.Deletion + } } } @@ -335,7 +338,7 @@ func (r *GoGitRepo) CollectCommits(subtaskCtx plugin.SubTaskContext) (err error) if err != nil { return err } - if !skipCommitFiles { + if !*taskOpts.SkipCommitFiles { if err := r.storeDiffCommitFilesComparedToParent(subtaskCtx, componentMap, commit); err != nil { return err } diff --git a/backend/plugins/gitextractor/parser/repo_libgit2.go b/backend/plugins/gitextractor/parser/repo_libgit2.go index c05c2bfcc..2bf2d6037 100644 --- a/backend/plugins/gitextractor/parser/repo_libgit2.go +++ b/backend/plugins/gitextractor/parser/repo_libgit2.go @@ -26,7 +26,6 @@ import ( "sort" "strconv" - "github.com/apache/incubator-devlake/core/config" "github.com/apache/incubator-devlake/core/dal" "github.com/apache/incubator-devlake/core/errors" "github.com/apache/incubator-devlake/core/log" @@ -38,8 +37,6 @@ import ( git "github.com/libgit2/git2go/v33" ) -const SkipCommitFiles = "SKIP_COMMIT_FILES" - var TypeNotMatchError = "the requested type does not match the type in the ODB" type GitRepo struct { @@ -231,6 +228,7 @@ func (r *GitRepo) CollectBranches(subtaskCtx plugin.SubTaskContext) error { // CollectCommits Collect data from each commit, we can also get the diff line func (r *GitRepo) CollectCommits(subtaskCtx plugin.SubTaskContext) error { + taskOpts := subtaskCtx.GetData().(*GitExtractorTaskData).Options opts, err := getDiffOpts() if err != nil { return err @@ -290,13 +288,16 @@ func (r *GitRepo) CollectCommits(subtaskCtx plugin.SubTaskContext) error { if commit.ParentCount() > 0 { parent = commit.Parent(0) } - var stats *git.DiffStats - if stats, err = r.getDiffComparedToParent(c.Sha, commit, parent, opts, componentMap); err != nil { - return err + + if !*taskOpts.SkipCommitStat { + var stats *git.DiffStats + if stats, err = r.getDiffComparedToParent(taskOpts, c.Sha, commit, parent, opts, componentMap); err != nil { + return err + } + r.logger.Debug("state: %#+v\n", stats.Deletions()) + c.Additions += stats.Insertions() + c.Deletions += stats.Deletions() } - r.logger.Debug("state: %#+v\n", stats.Deletions()) - c.Additions += stats.Insertions() - c.Deletions += stats.Deletions() err = r.store.Commits(c) if err != nil { @@ -331,7 +332,7 @@ func (r *GitRepo) storeParentCommits(commitSha string, commit *git.Commit) error return r.store.CommitParents(commitParents) } -func (r *GitRepo) getDiffComparedToParent(commitSha string, commit *git.Commit, parent *git.Commit, opts *git.DiffOptions, componentMap map[string]*regexp.Regexp) (*git.DiffStats, errors.Error) { +func (r *GitRepo) getDiffComparedToParent(taskOpts *GitExtractorOptions, commitSha string, commit *git.Commit, parent *git.Commit, opts *git.DiffOptions, componentMap map[string]*regexp.Regexp) (*git.DiffStats, errors.Error) { var err error var parentTree, tree *git.Tree if parent != nil { @@ -349,12 +350,7 @@ func (r *GitRepo) getDiffComparedToParent(commitSha string, commit *git.Commit, if err != nil { return nil, errors.Convert(err) } - cfg := config.GetConfig() - skipCommitFiles := true - if cfg.IsSet(SkipCommitFiles) { - skipCommitFiles = cfg.GetBool(SkipCommitFiles) - } - if !skipCommitFiles { + if !*taskOpts.SkipCommitFiles { err = r.storeCommitFilesFromDiff(commitSha, diff, componentMap) if err != nil { return nil, errors.Convert(err) diff --git a/backend/plugins/gitextractor/parser/taskdata.go b/backend/plugins/gitextractor/parser/taskdata.go new file mode 100644 index 000000000..fcbdf20af --- /dev/null +++ b/backend/plugins/gitextractor/parser/taskdata.go @@ -0,0 +1,57 @@ +/* +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. See the NOTICE file distributed with +this work for additional information regarding copyright ownership. +The ASF licenses this file to You under the Apache License, Version 2.0 +(the "License"); you may not use this file except in compliance with +the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package parser + +import ( + "strings" + + "github.com/apache/incubator-devlake/core/errors" +) + +type GitExtractorTaskData struct { + Options *GitExtractorOptions + GitRepo RepoCollector +} + +type GitExtractorOptions struct { + RepoId string `json:"repoId" mapstructure:"repoId"` + Name string `json:"name" mapstructure:"name"` + Url string `json:"url" mapstructure:"url"` + User string `json:"user" mapstructure:"user"` + Password string `json:"password" mapstructure:"password"` + PrivateKey string `json:"privateKey" mapstructure:"privateKey"` + Passphrase string `json:"passphrase" mapstructure:"passphrase"` + Proxy string `json:"proxy" mapstructure:"proxy"` + UseGoGit bool `json:"useGoGit" mapstructure:"useGoGit"` + SkipCommitStat *bool `json:"skipCommitStat" mapstructure:"skipCommitStat" comment:"skip all commit stat including added/deleted lines and commit files as well"` + SkipCommitFiles *bool `json:"skipCommitFiles" mapstructure:"skipCommitFiles"` +} + +func (o GitExtractorOptions) Valid() errors.Error { + if o.RepoId == "" { + return errors.BadInput.New("empty repoId") + } + if o.Url == "" { + return errors.BadInput.New("empty url") + } + url := strings.TrimPrefix(o.Url, "ssh://") + if !(strings.HasPrefix(o.Url, "http") || strings.HasPrefix(url, "git@") || strings.HasPrefix(o.Url, "/")) { + return errors.BadInput.New("wrong url") + } + return nil +} diff --git a/backend/plugins/gitextractor/tasks/clone.go b/backend/plugins/gitextractor/tasks/clone.go index 599d4adb6..14feadbbe 100644 --- a/backend/plugins/gitextractor/tasks/clone.go +++ b/backend/plugins/gitextractor/tasks/clone.go @@ -41,7 +41,7 @@ var CloneGitRepoMeta = plugin.SubTaskMeta{ ForceRunOnResume: true, } -func useGoGit(subTaskCtx plugin.SubTaskContext, taskData *GitExtractorTaskData) bool { +func useGoGit(subTaskCtx plugin.SubTaskContext, taskData *parser.GitExtractorTaskData) bool { if subTaskCtx.GetConfigReader().GetBool(useGoGitInGitExtractor) { return true } @@ -52,7 +52,7 @@ func useGoGit(subTaskCtx plugin.SubTaskContext, taskData *GitExtractorTaskData) } func CloneGitRepo(subTaskCtx plugin.SubTaskContext) errors.Error { - taskData, ok := subTaskCtx.GetData().(*GitExtractorTaskData) + taskData, ok := subTaskCtx.GetData().(*parser.GitExtractorTaskData) if !ok { panic("git repo reference not found on context") } @@ -77,7 +77,7 @@ func CloneGitRepo(subTaskCtx plugin.SubTaskContext) errors.Error { } // NewGitRepo create and return a new parser git repo -func NewGitRepo(ctx plugin.SubTaskContext, logger log.Logger, storage models.Store, op *GitExtractorOptions) (parser.RepoCollector, errors.Error) { +func NewGitRepo(ctx plugin.SubTaskContext, logger log.Logger, storage models.Store, op *parser.GitExtractorOptions) (parser.RepoCollector, errors.Error) { var err errors.Error var repo parser.RepoCollector p := parser.NewGitRepoCreator(storage, logger) @@ -94,7 +94,7 @@ func NewGitRepo(ctx plugin.SubTaskContext, logger log.Logger, storage models.Sto } // NewGoGitRepo create and return a new parser git repo with go-git -func NewGoGitRepo(ctx plugin.SubTaskContext, logger log.Logger, storage models.Store, op *GitExtractorOptions) (parser.RepoCollector, errors.Error) { +func NewGoGitRepo(ctx plugin.SubTaskContext, logger log.Logger, storage models.Store, op *parser.GitExtractorOptions) (parser.RepoCollector, errors.Error) { var err errors.Error var repo parser.RepoCollector p := parser.NewGitRepoCreator(storage, logger) diff --git a/backend/plugins/gitextractor/tasks/git_repo_collector.go b/backend/plugins/gitextractor/tasks/git_repo_collector.go index 00abf28cf..c8a287b36 100644 --- a/backend/plugins/gitextractor/tasks/git_repo_collector.go +++ b/backend/plugins/gitextractor/tasks/git_repo_collector.go @@ -18,44 +18,11 @@ limitations under the License. package tasks import ( - "strings" - "github.com/apache/incubator-devlake/core/errors" "github.com/apache/incubator-devlake/core/plugin" "github.com/apache/incubator-devlake/plugins/gitextractor/parser" ) -type GitExtractorTaskData struct { - Options *GitExtractorOptions - GitRepo parser.RepoCollector -} - -type GitExtractorOptions struct { - RepoId string `json:"repoId"` - Name string `json:"name"` - Url string `json:"url"` - User string `json:"user"` - Password string `json:"password"` - PrivateKey string `json:"privateKey"` - Passphrase string `json:"passphrase"` - Proxy string `json:"proxy"` - UseGoGit bool `json:"use_go_git"` -} - -func (o GitExtractorOptions) Valid() errors.Error { - if o.RepoId == "" { - return errors.BadInput.New("empty repoId") - } - if o.Url == "" { - return errors.BadInput.New("empty url") - } - url := strings.TrimPrefix(o.Url, "ssh://") - if !(strings.HasPrefix(o.Url, "http") || strings.HasPrefix(url, "git@") || strings.HasPrefix(o.Url, "/")) { - return errors.BadInput.New("wrong url") - } - return nil -} - func CollectGitCommits(subTaskCtx plugin.SubTaskContext) errors.Error { repo := getGitRepo(subTaskCtx) if count, err := repo.CountCommits(subTaskCtx.GetContext()); err != nil { @@ -105,7 +72,7 @@ func CollectGitDiffLines(subTaskCtx plugin.SubTaskContext) errors.Error { } func getGitRepo(subTaskCtx plugin.SubTaskContext) parser.RepoCollector { - taskData, ok := subTaskCtx.GetData().(*GitExtractorTaskData) + taskData, ok := subTaskCtx.GetData().(*parser.GitExtractorTaskData) if !ok { panic("git repo reference not found on context") }
