contrib/convert-repo
author Stephen Darnell
Wed, 14 Sep 2005 12:22:20 -0500
changeset 1241 3b4f05ff3130
parent 1237 227cfbe34109
child 1335 bea6356b8bca
permissions -rwxr-xr-x
Add support for cloning with hardlinks on windows. In order to use hardlinks, the win32file module is needed, and this is present in ActivePython. If it isn't present, or hardlinks are not supported on the underlying filesystem, a regular copy is used. When using hardlinks the biggest benefit is probably the saving in space, but cloning can be much quicker. For example cloning the Xen tree (non trivial) without an update goes from about 95s to 15s. Unix-like platforms should be unaffected, although should be more tolerant on filesystems that don't support hard links. (tweaked by mpm to deal with new copyfiles function) --- hg.orig/mercurial/commands.py 2005-09-13 19:32:53.000000000 -0500 +++ hg/mercurial/commands.py 2005-09-14 12:11:34.000000000 -0500 @@ -620,10 +620,6 @@ def clone(ui, source, dest=None, **opts) if other.dev() != -1: abspath = os.path.abspath(source) - copyfile = (os.stat(dest).st_dev == other.dev() - and getattr(os, 'link', None) or shutil.copy2) - if copyfile is not shutil.copy2: - ui.note("cloning by hardlink\n") # we use a lock here because if we race with commit, we can # end up with extra data in the cloned revlogs that's not @@ -638,7 +634,7 @@ def clone(ui, source, dest=None, **opts) for f in files.split(): src = os.path.join(source, ".hg", f) dst = os.path.join(dest, ".hg", f) - util.copyfiles(src, dst, copyfile) + util.copyfiles(src, dst) repo = hg.repository(ui, dest) Index: hg/mercurial/util.py =================================================================== --- hg.orig/mercurial/util.py 2005-09-08 00:15:25.000000000 -0500 +++ hg/mercurial/util.py 2005-09-14 12:16:49.000000000 -0500 @@ -12,7 +12,7 @@ platform-specific details from the core. import os, errno from demandload import * -demandload(globals(), "re cStringIO") +demandload(globals(), "re cStringIO shutil") def binary(s): """return true if a string is binary data using diff's heuristic""" @@ -217,17 +217,28 @@ def rename(src, dst): os.unlink(dst) os.rename(src, dst) -def copyfiles(src, dst, copyfile): - """Copy a directory tree, files are copied using 'copyfile'.""" +def copyfiles(src, dst, hardlink=None): + """Copy a directory tree using hardlinks if possible""" + + if hardlink is None: + hardlink = (os.stat(src).st_dev == + os.stat(os.path.dirname(dst)).st_dev) if os.path.isdir(src): os.mkdir(dst) for name in os.listdir(src): srcname = os.path.join(src, name) dstname = os.path.join(dst, name) - copyfiles(srcname, dstname, copyfile) + copyfiles(srcname, dstname, hardlink) else: - copyfile(src, dst) + if hardlink: + try: + os_link(src, dst) + except: + hardlink = False + shutil.copy2(src, dst) + else: + shutil.copy2(src, dst) def opener(base): """ @@ -244,13 +255,13 @@ def opener(base): if mode[0] != "r": try: - s = os.stat(f) + nlink = nlinks(f) except OSError: d = os.path.dirname(f) if not os.path.isdir(d): os.makedirs(d) else: - if s.st_nlink > 1: + if nlink > 1: file(f + ".tmp", "wb").write(file(f, "rb").read()) rename(f+".tmp", f) @@ -266,10 +277,41 @@ def _makelock_file(info, pathname): def _readlock_file(pathname): return file(pathname).read() +def nlinks(pathname): + """Return number of hardlinks for the given file.""" + return os.stat(pathname).st_nlink + +if hasattr(os, 'link'): + os_link = os.link +else: + def os_link(src, dst): + raise OSError(0, "Hardlinks not supported") + # Platform specific variants if os.name == 'nt': nulldev = 'NUL:' + try: # ActivePython can create hard links using win32file module + import win32file + + def os_link(src, dst): # NB will only succeed on NTFS + win32file.CreateHardLink(dst, src) + + def nlinks(pathname): + """Return number of hardlinks for the given file.""" + try: + fh = win32file.CreateFile(pathname, + win32file.GENERIC_READ, win32file.FILE_SHARE_READ, + None, win32file.OPEN_EXISTING, 0, None) + res = win32file.GetFileInformationByHandle(fh) + fh.Close() + return res[7] + except: + return os.stat(pathname).st_nlink + + except ImportError: + pass + def is_exec(f, last): return last

#!/usr/bin/env python
#
# This is a generalized framework for converting between SCM
# repository formats.
#
# In its current form, it's hardcoded to convert incrementally between
# git and Mercurial.
#
# To use, you must first import the first git version into Mercurial,
# and establish a mapping between the git commit hash and the hash in
# Mercurial for that version. This mapping is kept in a simple text
# file with lines like so:
#
# <git hash> <mercurial hash>
#
# To convert the rest of the repo, run:
#
# convert-repo <git-dir> <hg-dir> <mapfile>
#
# This updates the mapfile on each commit copied, so it can be
# interrupted and can be run repeatedly to copy new commits.

import sys, os, zlib, sha, time
from mercurial import hg, ui, util

class convert_git:
    def __init__(self, path):
        self.path = path

    def getheads(self):
        h = file(self.path + "/.git/HEAD").read()[:-1]
        return [h]

    def catfile(self, rev, type):
        if rev == "0" * 40: raise IOError()
        path = os.getcwd()
        os.chdir(self.path)
        fh = os.popen("git-cat-file %s %s 2>/dev/null" % (type, rev))
        os.chdir(path)
        return fh.read()

    def getfile(self, name, rev):
        return self.catfile(rev, "blob")

    def getchanges(self, version):
        path = os.getcwd()
        os.chdir(self.path)
        fh = os.popen("git-diff-tree --root -m -r %s" % (version))
        os.chdir(path)

        changes = []
        for l in fh:
            if "\t" not in l: continue
            m, f = l[:-1].split("\t")
            m = m.split()
            h = m[3]
            p = (m[1] == "100755")
            changes.append((f, h, p))
        return changes

    def getcommit(self, version):
        c = self.catfile(version, "commit") # read the commit hash
        end = c.find("\n\n")
        message = c[end+2:]
        l = c[:end].splitlines()
        manifest = l[0].split()[1]
        parents = []
        for e in l[1:]:
            n,v = e.split(" ", 1)
            if n == "author":
                p = v.split()
                date = " ".join(p[-2:])
                author = " ".join(p[:-2])
                if author[0] == "<": author = author[1:-1]
            if n == "committer":
                p = v.split()
                date = " ".join(p[-2:])
                committer = " ".join(p[:-2])
                if committer[0] == "<": committer = committer[1:-1]
                message += "\ncommitter: %s %s\n" % (committer, date)
            if n == "parent": parents.append(v)
        return (parents, author, date, message)

    def gettags(self):
        tags = {}
        for f in os.listdir(self.path + "/.git/refs/tags"):
            try:
                h = file(self.path + "/.git/refs/tags/" + f).read().strip()
                tags[f] = h
            except:
                pass
        return tags

class convert_mercurial:
    def __init__(self, path):
        self.path = path
        u = ui.ui()
        self.repo = hg.repository(u, path)

    def getheads(self):
        h = self.repo.changelog.heads()
        h = [ hg.hex(x) for x in h ]
        return h

    def putfile(self, f, e, data):
        self.repo.wfile(f, "w").write(data)
        if self.repo.dirstate.state(f) == '?':
            self.repo.dirstate.update([f], "a")

        util.set_exec(self.repo.wjoin(f), e)

    def delfile(self, f):
        try:
            os.unlink(self.repo.wjoin(f))
            #self.repo.remove([f])
        except:
            pass

    def putcommit(self, files, parents, author, dest, text):
        seen = {}
        pl = []
        for p in parents:
            if p not in seen:
                pl.append(p)
                seen[p] = 1
        parents = pl

        if len(parents) < 2: parents.append("0" * 40)
        if len(parents) < 2: parents.append("0" * 40)
        p2 = parents.pop(0)

        while parents:
            p1 = p2
            p2 = parents.pop(0)
            self.repo.rawcommit(files, text, author, dest,
                                hg.bin(p1), hg.bin(p2))
            text = "(octopus merge fixup)\n"

        return hg.hex(self.repo.changelog.tip())

    def puttags(self, tags):
        try:
            old = self.repo.wfile(".hgtags").read()
            oldlines = old.splitlines(1)
            oldlines.sort()
        except:
            oldlines = []

        k = tags.keys()
        k.sort()
        newlines = []
        for tag in k:
            newlines.append("%s %s\n" % (tags[tag], tag))

        newlines.sort()

        if newlines != oldlines:
            print "updating tags"
            f = self.repo.wfile(".hgtags", "w")
            f.write("".join(newlines))
            f.close()
            if not oldlines: self.repo.add([".hgtags"])
            date = "%s 0" % time.mktime(time.gmtime())
            self.repo.rawcommit([".hgtags"], "update tags", "convert-repo",
                                date, self.repo.changelog.tip(), hg.nullid)

class convert:
    def __init__(self, source, dest, mapfile):
        self.source = source
        self.dest = dest
        self.mapfile = mapfile
        self.commitcache = {}

        self.map = {}
        for l in file(self.mapfile):
            sv, dv = l[:-1].split()
            self.map[sv] = dv

    def walktree(self, heads):
        visit = heads
        known = {}
        parents = {}
        while visit:
            n = visit.pop(0)
            if n in known or n in self.map: continue
            known[n] = 1
            self.commitcache[n] = self.source.getcommit(n)
            cp = self.commitcache[n][0]
            for p in cp:
                parents.setdefault(n, []).append(p)
                visit.append(p)

        return parents

    def toposort(self, parents):
        visit = parents.keys()
        seen = {}
        children = {}

        while visit:
            n = visit.pop(0)
            if n in seen: continue
            seen[n] = 1
            pc = 0
            if n in parents:
                for p in parents[n]:
                    if p not in self.map: pc += 1
                    visit.append(p)
                    children.setdefault(p, []).append(n)
            if not pc: root = n

        s = []
        removed = {}
        visit = children.keys()
        while visit:
            n = visit.pop(0)
            if n in removed: continue
            dep = 0
            if n in parents:
                for p in parents[n]:
                    if p in self.map: continue
                    if p not in removed:
                        # we're still dependent
                        visit.append(n)
                        dep = 1
                        break

            if not dep:
                # all n's parents are in the list
                removed[n] = 1
                s.append(n)
                if n in children:
                    for c in children[n]:
                        visit.insert(0, c)

        return s

    def copy(self, rev):
        p, a, d, t = self.commitcache[rev]
        files = self.source.getchanges(rev)

        for f,v,e in files:
            try:
                data = self.source.getfile(f, v)
            except IOError, inst:
                self.dest.delfile(f)
            else:
                self.dest.putfile(f, e, data)

        r = [self.map[v] for v in p]
        f = [f for f,v,e in files]
        self.map[rev] = self.dest.putcommit(f, r, a, d, t)
        file(self.mapfile, "a").write("%s %s\n" % (rev, self.map[rev]))

    def convert(self):
        heads = self.source.getheads()
        parents = self.walktree(heads)
        t = self.toposort(parents)
        num = len(t)

        for c in t:
            num -= 1
            if c in self.map: continue
            desc = self.commitcache[c][3].splitlines()[0]
            print num, desc
            self.copy(c)

        tags = self.source.gettags()
        ctags = {}
        for k in tags:
            v = tags[k]
            if v in self.map:
                ctags[k] = self.map[v]

        self.dest.puttags(ctags)

gitpath, hgpath, mapfile = sys.argv[1:]

c = convert(convert_git(gitpath), convert_mercurial(hgpath), mapfile)
c.convert()