On Wed, Mar 30, 2011 at 06:25:12PM -0300, Guilherme Salgado wrote: > On Wed, 2011-03-30 at 12:10 +0800, Jeremy Kerr wrote: > > Hi Guilherme, > > > > > apps/patchwork/models.py | 1 + > > > lib/sql/migration/008-project-source-tree.sql | 3 +++ > > > 2 files changed, 4 insertions(+), 0 deletions(-) > > > create mode 100644 lib/sql/migration/008-project-source-tree.sql > > > > Looks good. I'd like to wait until there is a user of this field before > > merging the change though - or are you using this for linaro-internal > > things? > > I'm using it on a script I'm writing to fetch the git history of every > project and scan that looking for patches that have been committed. Just > like the existing patchwork-update-commits script does but this one is > fully automated, and to make it more easily testable I'm experimenting > with python-dulwich to scan the git history. > > I haven't submitted it for feedback yet because it's still very much a > work in progress, but I would certainly do so. (I'm including what I > have so far here, though) > > Oh, there's one extra DB schema change I'm considering... That will > allow us to keep track of the last seen commit ref so that we can scan > the git history incrementally. I've added it as another field to > Project, named last_seen_commit_ref. How does that sound to you? > > > -- > Guilherme Salgado <https://launchpad.net/~salgado>
> commit dd10a245c50f651b38e7074d49413e91b2d82e14 > Author: Guilherme Salgado <[email protected]> > Date: Fri Mar 25 15:59:28 2011 -0300 > > Adds a script which goes through all registered projects looking for > patches that have been committed already > > It does that by checking out the project's source code from its VCS of > choice > (currently only git is supported, though), scanning the commits there and > comparing them to the patches in Patchwork. > > diff --git a/apps/patchwork/bin/update-committed-patches.py > b/apps/patchwork/bin/update-committed-patches.py > new file mode 100755 > index 0000000..a444783 > --- /dev/null > +++ b/apps/patchwork/bin/update-committed-patches.py > @@ -0,0 +1,35 @@ > +#!/usr/bin/python > + > +import _pythonpath > +from patchwork.models import Patch, Project, State > +from patchwork.utils import ( > + ensure_source_checkout_for_project, get_hashes_for_commits) > + > + > +#for project in Project.objects.all(): > +for project in Project.objects.filter(linkname='linux-kernel'): > + if project.source_tree is None: > + continue > + > + repo = ensure_source_checkout_for_project(project) > + if repo is None: > + print ("Skipping %s as we couldn't get a source checkout" % > + project.name) > + continue > + > + hashes = get_hashes_for_commits(repo, > + stop_at=project.last_seen_commit_ref) > + for commit_id, patch_hash in hashes: > + # There may be multiple patches with the same hash. That's usually > + # the case when a second version of a patch series is submitted > + # and some of the patches in the series are identical in both > + # series. > + for patch in Patch.objects.filter(project=project, hash=patch_hash): > + patch.state = State.objects.get(name='Accepted') > + patch.commit_ref = commit_id > + print patch, patch.state > + else: > + print "No new %s commits to parse" % project.name > + > + project.last_seen_commit_ref = repo.head() > + project.save() > diff --git a/apps/patchwork/tests/__init__.py > b/apps/patchwork/tests/__init__.py > index 68fe563..e79331b 100644 > --- a/apps/patchwork/tests/__init__.py > +++ b/apps/patchwork/tests/__init__.py > @@ -23,3 +23,4 @@ from patchwork.tests.bundles import * > from patchwork.tests.mboxviews import * > from patchwork.tests.updates import * > from patchwork.tests.filters import * > +from patchwork.tests.test_utils import * > diff --git a/apps/patchwork/tests/test_utils.py > b/apps/patchwork/tests/test_utils.py > new file mode 100644 > index 0000000..0c60e74 > --- /dev/null > +++ b/apps/patchwork/tests/test_utils.py > @@ -0,0 +1,74 @@ > + > +import tempfile > +from time import time > +from unittest import TestCase > + > +from dulwich.objects import Blob, Commit, parse_timezone, Tree > +from dulwich.repo import Repo > + > +from patchwork.utils import get_hashes_for_commits > + > + > +class TestGetHashesForCommits(TestCase): > + > + def test_one_commit(self): > + repo = self.create_git_repo() > + commit = self.add_file_and_commit(repo, 'foo', 'Content1') > + # Here there are no hashes because get_hashes_for_commits() skips the > + # first one as it's unlikely to be of any interest to us. > + self.assertEqual( > + [], list(get_hashes_for_commits(repo, stop_at=None))) > + > + def test_two_commits(self): > + repo = self.create_git_repo() > + commit = self.add_file_and_commit(repo, 'foo', 'Content1') > + commit2 = self.add_file_and_commit(repo, 'bar', 'Content2', commit) > + self.assertEqual( > + [(commit2.id, '5c010402c5673981ee3e1712e6a037de3ff9cae4')], > + list(get_hashes_for_commits(repo, stop_at=None))) > + > + def test_empty_patch(self): > + repo = self.create_git_repo() > + commit = self.add_file_and_commit(repo, 'foo', 'Content1') > + head = self.add_file_and_commit(repo, 'bar', '', commit) > + self.assertEqual( > + [], list(get_hashes_for_commits(repo, stop_at=None))) > + > + def test_stop_at(self): > + repo = self.create_git_repo() > + commit = self.add_file_and_commit(repo, 'foo', 'Content1') > + commit2 = self.add_file_and_commit(repo, 'bar', 'Content2', commit) > + commit3 = self.add_file_and_commit(repo, 'baz', 'Content3', commit2) > + self.assertEqual( > + [(commit3.id, '11d22fa0986b3bb341baa76b8a6a757a46a2f916')], > + list(get_hashes_for_commits(repo, stop_at=commit2.id))) > + > + def create_git_repo(self): > + tmpdir = tempfile.mkdtemp() > + repo = Repo.init(tmpdir) > + return repo > + > + def add_file_and_commit(self, repo, filename, data, parent=None): > + blob = Blob.from_string(data) > + parents = [] > + tree = Tree() > + if parent is not None: > + tree = repo[parent.tree] > + parents = [parent.id] > + tree.add(0100644, filename, blob.id) > + commit = Commit() > + commit.tree = tree.id > + author = 'You <[email protected]>' > + commit.author = commit.committer = author > + commit.commit_time = commit.author_time = int(time()) > + tz = parse_timezone('-0200')[0] > + commit.commit_timezone = commit.author_timezone = tz > + commit.encoding = "UTF-8" > + commit.message = "A commit" > + commit.parents = parents > + object_store = repo.object_store > + object_store.add_object(blob) > + object_store.add_object(tree) > + object_store.add_object(commit) > + repo.refs['refs/heads/master'] = commit.id > + return commit > diff --git a/apps/patchwork/utils.py b/apps/patchwork/utils.py > index e41ffb6..353147e 100644 > --- a/apps/patchwork/utils.py > +++ b/apps/patchwork/utils.py > @@ -17,8 +17,15 @@ > # along with Patchwork; if not, write to the Free Software > # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA > > +import os > +from StringIO import StringIO > > -from patchwork.models import Bundle, Project, BundlePatch > +from dulwich.client import get_transport_and_path > +from dulwich.patch import write_tree_diff > +from dulwich.repo import Repo > + > +from patchwork.parser import hash_patch, parse_patch > +from patchwork.models import Bundle, BundlePatch > from django.shortcuts import get_object_or_404 > > def get_patch_ids(d, prefix = 'patch_id'): > @@ -137,3 +144,64 @@ def set_bundle(user, project, action, data, patches, > context): > bundle.save() > > return [] > + > + > +def ensure_source_checkout_for_project(project): > + forest = '/home/salgado/src' # This is where we store the trees we > checkout > + root = os.path.join(forest, project.linkname) > + if not os.path.exists(root): > + repo = Repo.init(root, mkdir=True) > + else: > + repo = Repo(root) > + > + transport, path = get_transport_and_path(project.source_tree) > + refs = transport.fetch(path, repo) > + # XXX: Is this the appropriate thing to do? will there always be a > master > + # branch? > + repo.refs['refs/heads/master'] = refs['HEAD'] > + return repo > + > + > +def get_hashes_for_commits(repo, stop_at): > + # We don't care about the first commit, but if needed it should be > + # possible to diff it against an empty tree and yield its hash as well. > + commit = repo['HEAD'] > + > + while len(commit.parents) > 0: > + commit_id = commit.id > + if commit_id == stop_at: > + break > + > + parent = repo[commit.parents[0]] What about possible other parents? I would say every merged branch has to be inspected until the merge-base of the immediate predecessors -- maybe recursion works here. I forgot to mention Git-Python in the other mail. It has a wrapper to execute git commands with python syntax. That can facilitate such things. https://github.com/gitpython-developers/GitPython > + diff = StringIO() > + # In the case of merges, this won't have the same behavior as 'git > + # show', which seems to omit files not changed since any of the > + # parents (thanks to 'git tree-diff --cc'), but I think this is not a > + # big deal as patches in Patchwork would never be identical to the > + # diff of a merge anyway, would they? > + try: > + write_tree_diff( > + diff, repo.object_store, parent.tree, commit.tree) > + except KeyError, e: > + # XXX: This happens in qemu because there's a commit that is > + # actually on a submodule > + # (8b06c62ae48b67b320f7420dcd4854c5559e1532) and the old > commit_id > + # used for the submodule > + # (06d0bdd9e2e20377b3180e4986b14c8549b393e4) is gone (possibly > + # because of a rebase?), so dulwich crashes with a KeyError. > + raise > + > + commit = parent > + diff.seek(0) > + try: > + patch, _ = parse_patch(diff.read().decode('utf-8')) > + except UnicodeDecodeError: > + # TODO: Need to find out why this happens and see if we really > + # need to skip such commits. > + print "Skipping %s" % commit.id > + continue > + > + # When commits just add files or change permissions the diff will be > + # empty and thus parse_patch() will return None. > + if patch is not None: > + yield commit_id, hash_patch(patch).hexdigest() -- Cheers, Dirk _______________________________________________ Patchwork mailing list [email protected] https://lists.ozlabs.org/listinfo/patchwork
