comparison hgext/convert/__init__.py @ 4512:91709ba3cc88

Move convert-repo to hgext/convert/__init__.py
author Thomas Arendsen Hein <thomas@intevation.de>
date Wed, 06 Jun 2007 19:49:47 +0200
parents contrib/convert-repo@af013ae3ca10
children ac2fe196ac9b
comparison
equal deleted inserted replaced
4511:1d46169ec197 4512:91709ba3cc88
1 #!/usr/bin/env python
2 #
3 # This is a generalized framework for converting between SCM
4 # repository formats.
5 #
6 # To use, run:
7 #
8 # convert-repo <source> [<dest> [<mapfile>]]
9 #
10 # Currently accepted source formats: git, cvs
11 # Currently accepted destination formats: hg
12 #
13 # If destination isn't given, a new Mercurial repo named <src>-hg will
14 # be created. If <mapfile> isn't given, it defaults to
15 # <dest>/.hg/shamap.
16 #
17 # The <mapfile> is a simple text file that maps each source commit ID to
18 # the destination ID for that revision, like so:
19 #
20 # <source ID> <destination ID>
21 #
22 # If the file doesn't exist, it's automatically created. It's updated
23 # on each commit copied, so convert-repo can be interrupted and can
24 # be run repeatedly to copy new commits.
25
26 import sys, os, zlib, sha, time, re, locale, socket
27 os.environ["HGENCODING"] = "utf-8"
28 from mercurial import hg, ui, util, fancyopts
29
class Abort(Exception):
    """Raised by abort() to stop the conversion with a message."""


class NoRepo(Exception):
    """Raised by a converter constructor that cannot handle the given path."""
32
class commit(object):
    """Plain record describing one changeset.

    Every keyword argument becomes an attribute of the instance.  The
    fields author, date, desc and parents are mandatory; abort() is called
    for the first one that is missing.
    """

    def __init__(self, **parts):
        required = ("author", "date", "desc", "parents")
        missing = [field for field in required if field not in parts]
        if missing:
            abort("commit missing field %s\n" % missing[0])
        self.__dict__.update(parts)
39
# When true, status() prints nothing.
quiet = 0


def status(msg):
    """Write str(msg) to stdout unless the module-level quiet flag is set."""
    if quiet:
        return
    sys.stdout.write(str(msg))
43
def warn(msg):
    """Write str(msg) to stderr; no newline is added."""
    text = str(msg)
    sys.stderr.write(text)
46
def abort(msg):
    """Raise Abort carrying msg; never returns."""
    error = Abort(msg)
    raise error
49
def recode(s):
    """Return the byte string s as UTF-8 encoded bytes.

    s is first validated as UTF-8 and returned re-encoded (i.e. unchanged
    in content).  If it is not valid UTF-8 it is interpreted as latin-1
    and transcoded to UTF-8.
    """
    try:
        return s.decode("utf-8").encode("utf-8")
    except UnicodeError:
        # latin-1 assigns a character to every byte value, so decoding a
        # byte string this way cannot fail and no further fallback is
        # needed.
        return s.decode("latin-1").encode("utf-8")
58
class converter_source(object):
    """Interface every conversion source has to implement.

    All methods here raise NotImplementedError; concrete sources override
    them.
    """

    def __init__(self, path):
        """Set up the source for the repository at path.

        Implementations raise NoRepo("message") when path is not a valid
        repository for them.
        """
        raise NotImplementedError()

    def getheads(self):
        """Return the list of heads of this repository."""
        raise NotImplementedError()

    def getfile(self, name, rev):
        """Return the contents of file name at rev as a string."""
        raise NotImplementedError()

    def getmode(self, name, rev):
        """Return the mode of file name at rev, e.g. '', 'x' or 'l'."""
        raise NotImplementedError()

    def getchanges(self, version):
        """Return a sorted list of (filename, id) tuples, one per file
        changed in version.

        The id only tells getfile() which revision of the file to return;
        for git it is an object hash.
        """
        raise NotImplementedError()

    def getcommit(self, version):
        """Return the commit object describing version."""
        raise NotImplementedError()

    def gettags(self):
        """Return the tags as a dictionary mapping name to revision."""
        raise NotImplementedError()
93
class converter_sink(object):
    """Interface every conversion sink (target) has to implement.

    All methods here raise NotImplementedError; concrete sinks override
    them.
    """

    def __init__(self, path):
        """Set up the sink for the repository at path.

        Implementations raise NoRepo("message") when path is not a valid
        repository for them.
        """
        raise NotImplementedError()

    def getheads(self):
        """Return the list of heads of this repository."""
        raise NotImplementedError()

    def mapfile(self):
        """Return the path of the file that stores lines of the form
            source_rev_id sink_rev_id
        pairing equivalent revision identifiers of the two systems."""
        raise NotImplementedError()

    def putfile(self, f, e, data):
        """Stage a file for the next putcommit().

        f: path to file
        e: '', 'x', or 'l' (regular file, executable, or symlink)
        data: file contents
        """
        raise NotImplementedError()

    def delfile(self, f):
        """Stage the removal of a file for the next putcommit().

        f: path to file
        """
        raise NotImplementedError()

    def putcommit(self, files, parents, commit):
        """Create a revision containing the changed files listed in
        'files' with the given parents.

        'commit' is a commit object providing at least the author, date
        and message of the changeset.  This is called after the putfile()
        and delfile() calls.  The sink is not told to update itself to a
        particular revision (nor which revision that would be) before it
        receives the file data.
        """
        raise NotImplementedError()

    def puttags(self, tags):
        """Store tags in the sink.

        tags: {tagname: sink_rev_id, ...}
        """
        raise NotImplementedError()
138
139
140 # CVS conversion code inspired by hg-cvs-import and git-cvsimport
class convert_cvs(converter_source):
    """Conversion source backed by a CVS checkout.

    Changeset metadata is obtained by parsing the output of ``cvsps`` run
    inside the checkout.  File contents are fetched by talking to a CVS
    server (pserver socket, or a ``cvs server`` child process started
    locally or through a remote shell) for the root named in CVS/Root.
    """

    def __init__(self, path):
        """Open the CVS checkout at path, parse its history and connect.

        Raises NoRepo if path has no "CVS" subdirectory.
        """
        self.path = path
        cvs = os.path.join(path, "CVS")
        if not os.path.exists(cvs):
            raise NoRepo("couldn't open CVS repo %s" % path)

        self.changeset = {}
        self.files = {}
        self.tags = {}
        self.lastbranch = {}
        self.parent = {}
        self.socket = None
        # filled by getfile(), reset by getchanges(); created here so that
        # getfile()/getmode() do not depend on getchanges() having run
        self.modecache = {}
        self.cvsroot = self._readadmin(cvs, "Root")
        self.cvsrepo = self._readadmin(cvs, "Repository")
        self.encoding = locale.getpreferredencoding()
        self._parse()
        self._connect()

    def _readadmin(self, cvsdir, name):
        """Return the contents of cvsdir/name without its last character
        (the trailing newline)."""
        fp = open(os.path.join(cvsdir, name))
        try:
            return fp.read()[:-1]
        finally:
            fp.close()

    def _parse(self):
        """Fill changeset, files, tags, parent and heads from cvsps output.

        Does nothing when changesets have already been parsed.  cvsps is
        run with the checkout as working directory; the previous working
        directory is restored afterwards, even on error.
        """
        if self.changeset:
            return

        d = os.getcwd()
        try:
            os.chdir(self.path)
            csid = None
            # 0: reading patchset header, 1: reading log, 2: reading members
            state = 0
            for l in os.popen("cvsps -A -u --cvs-direct -q"):
                if state == 0: # header
                    if l.startswith("PatchSet"):
                        csid = l[9:-2]
                    elif l.startswith("Date"):
                        date = util.parsedate(l[6:-1], ["%Y/%m/%d %H:%M:%S"])
                        date = util.datestr(date)
                    elif l.startswith("Branch"):
                        branch = l[8:-1]
                        # parent is the previous patchset seen on the same
                        # branch ('bad' if there was none)
                        self.parent[csid] = self.lastbranch.get(branch, 'bad')
                        self.lastbranch[branch] = csid
                    elif l.startswith("Ancestor branch"):
                        ancestor = l[17:-1]
                        self.parent[csid] = self.lastbranch[ancestor]
                    elif l.startswith("Author"):
                        author = self.recode(l[8:-1])
                    elif l.startswith("Tag: "):
                        t = l[5:-1].rstrip()
                        if t != "(none)":
                            self.tags[t] = csid
                    elif l.startswith("Log:"):
                        state = 1
                        log = ""
                elif state == 1: # log
                    if l == "Members: \n":
                        files = {}
                        log = self.recode(log[:-1])
                        if log.isspace():
                            log = "*** empty log message ***\n"
                        state = 2
                    else:
                        log += l
                elif state == 2: # members
                    if l == "\n":
                        # blank line ends the patchset
                        state = 0
                        p = [self.parent[csid]]
                        if csid == "1":
                            p = []
                        c = commit(author=author, date=date, parents=p,
                                   desc=log, branch=branch)
                        self.changeset[csid] = c
                        self.files[csid] = files
                    else:
                        # "<tab>name:oldrev->newrev \n"; keep the new rev
                        name, rev = l[1:-2].rsplit(':', 1)
                        rev = rev.split("->")[1]
                        files[name] = rev

            self.heads = list(self.lastbranch.values())
        finally:
            os.chdir(d)

    def _cvspass(self, roots):
        """Return the scrambled password stored in ~/.cvspass for one of
        the root strings in roots.

        Returns None when HOME is unset, the file cannot be opened, or no
        entry matches.
        """
        try:
            pf = open(os.path.join(os.environ["HOME"], ".cvspass"))
        except (KeyError, IOError):
            return None
        try:
            for l in pf:
                # entries look like
                #   :pserver:cvs@mea.tmt.tele.fi:/cvsroot/zmailer Ah<Z
                # optionally preceded by a "/1 " format marker; drop the
                # marker and the line terminator before comparing
                m = re.match(r'(?:/\d+\s+)?(.*)', l.rstrip("\r\n"))
                entry = m.group(1)
                if ' ' not in entry:
                    continue
                w, p = entry.split(' ', 1)
                if w in roots:
                    return p
        finally:
            pf.close()
        return None

    def _connect(self):
        """Open the channel to the CVS server and do the initial handshake.

        Sets self.writep / self.readp (file-like objects used to talk to
        the server) and self.realroot (the repository path on the server).
        Raises NoRepo when pserver authentication is rejected and aborts
        when the server does not answer with "Valid-requests".
        """
        root = self.cvsroot
        conntype = None
        user, host = None, None
        cmd = ['cvs', 'server']

        status("connecting to %s\n" % root)

        if root.startswith(":pserver:"):
            root = root[9:]
            m = re.match(r'(?:(.*?)(?::(.*?))?@)?([^:\/]*)(?::(\d*))?(.*)', root)
            if m:
                conntype = "pserver"
                user, passw, serv, port, root = m.groups()
                if not user:
                    user = "anonymous"

                # .cvspass keys carry the port between host and path.  When
                # no port was given, also accept the key without a port.
                noport = ":pserver:%s@%s:%s" % (user, serv, root)
                if port:
                    port = int(port)
                    wanted = []
                else:
                    port = 2401
                    wanted = [noport]
                wanted.append(":pserver:%s@%s:%d%s" % (user, serv, port, root))

                if not passw:
                    # "A" is used when no stored password is found
                    passw = self._cvspass(wanted) or "A"

                sck = socket.socket()
                sck.connect((serv, port))
                # sendall: send() may transmit only part of the request
                sck.sendall("\n".join(["BEGIN AUTH REQUEST", root, user,
                                       passw, "END AUTH REQUEST", ""]))
                if sck.recv(128) != "I LOVE YOU\n":
                    raise NoRepo("CVS pserver authentication failed")

                self.writep = self.readp = sck.makefile('r+')

        if not conntype and root.startswith(":local:"):
            conntype = "local"
            root = root[7:]

        if not conntype:
            # :ext:user@host/home/user/path/to/cvsroot
            if root.startswith(":ext:"):
                root = root[5:]
            m = re.match(r'(?:([^@:/]+)@)?([^:/]+):?(.*)', root)
            if not m:
                conntype = "local"
            else:
                conntype = "rsh"
                user, host, root = m.group(1), m.group(2), m.group(3)

        if conntype != "pserver":
            if conntype == "rsh":
                # fall back to "rsh" when CVS_RSH is unset or empty
                rsh = os.environ.get("CVS_RSH") or "rsh"
                if user:
                    cmd = [rsh, '-l', user, host] + cmd
                else:
                    cmd = [rsh, host] + cmd

            self.writep, self.readp = os.popen2(cmd)

        self.realroot = root

        self.writep.write("Root %s\n" % root)
        self.writep.write("Valid-responses ok error Valid-requests Mode"
                          " M Mbinary E Checked-in Created Updated"
                          " Merged Removed\n")
        self.writep.write("valid-requests\n")
        self.writep.flush()
        r = self.readp.readline()
        if not r.startswith("Valid-requests"):
            abort("server sucks\n")
        if "UseUnchanged" in r:
            self.writep.write("UseUnchanged\n")
            self.writep.flush()
            r = self.readp.readline()

    def getheads(self):
        """Return the last patchset id seen on each branch."""
        return self.heads

    def _getfile(self, name, rev):
        """Check out file name at CVS revision rev from the server.

        Returns a (data, mode) tuple where mode is "x" if the mode line
        sent by the server contains an "x", else "".  Raises IOError when
        rev ends with "(DEAD)" and aborts on an unexpected server response.
        """
        if rev.endswith("(DEAD)"):
            raise IOError

        args = ("-N -P -kk -r %s --" % rev).split()
        args.append(os.path.join(self.cvsrepo, name))
        for x in args:
            self.writep.write("Argument %s\n" % x)
        self.writep.write("Directory .\n%s\nco\n" % self.realroot)
        self.writep.flush()

        data = ""
        # stays empty if the server never sends a Created/Updated response
        mode = ""
        while 1:
            line = self.readp.readline()
            if line.startswith("Created ") or line.startswith("Updated "):
                self.readp.readline() # path
                self.readp.readline() # entries
                mode = self.readp.readline()[:-1]
                count = int(self.readp.readline()[:-1])
                data = self.readp.read(count)
            elif line.startswith(" "):
                data += line[1:]
            elif line.startswith("M "):
                pass
            elif line.startswith("Mbinary "):
                count = int(self.readp.readline()[:-1])
                data = self.readp.read(count)
            else:
                if line == "ok\n":
                    return (data, "x" in mode and "x" or "")
                elif line.startswith("E "):
                    warn("cvs server: %s\n" % line[2:])
                elif line.startswith("Remove"):
                    # NOTE(review): after the trailing "ok" the loop keeps
                    # reading from the server; confirm whether this branch
                    # should return or raise instead.
                    l = self.readp.readline()
                    l = self.readp.readline()
                    if l != "ok\n":
                        abort("unknown CVS response: %s\n" % l)
                else:
                    abort("unknown CVS response: %s\n" % line)

    def getfile(self, file, rev):
        """Return the contents of file at rev and remember its mode for
        getmode()."""
        data, mode = self._getfile(file, rev)
        self.modecache[(file, rev)] = mode
        return data

    def getmode(self, file, rev):
        """Return the mode recorded by the preceding getfile(file, rev)."""
        return self.modecache[(file, rev)]

    def getchanges(self, rev):
        """Return the sorted (filename, revision) pairs of patchset rev.

        Also clears the mode cache.
        """
        self.modecache = {}
        return sorted(self.files[rev].items())

    def recode(self, text):
        """Transcode text from the locale's preferred encoding to UTF-8,
        replacing undecodable bytes."""
        return text.decode(self.encoding, "replace").encode("utf-8")

    def getcommit(self, rev):
        """Return the commit object parsed for patchset rev."""
        return self.changeset[rev]

    def gettags(self):
        """Return the {tag: patchset id} mapping collected by _parse()."""
        return self.tags
371
class convert_git(converter_source):
    """Conversion source backed by a git repository.

    All data is obtained by running git command line tools with GIT_DIR
    pointing at the repository.
    """

    def __init__(self, path):
        """Open the git repository at path (or at path/.git if that is a
        directory).

        Raises NoRepo if the resulting directory has no "objects" entry.
        """
        if os.path.isdir(path + "/.git"):
            path += "/.git"
        self.path = path
        # (filename, blob hash) -> mode flag, refilled by getchanges()
        self.modecache = {}
        if not os.path.exists(path + "/objects"):
            raise NoRepo("couldn't open GIT repo %s" % path)

    def getheads(self):
        """Return a one-element list holding the hash HEAD resolves to."""
        # NOTE(review): self.path is interpolated into a shell command
        # unquoted here and in catfile()/getchanges().
        fh = os.popen("GIT_DIR=%s git-rev-parse --verify HEAD" % self.path)
        return [fh.read()[:-1]]

    def catfile(self, rev, type):
        """Return the output of git-cat-file for object rev of the given
        type.

        Raises IOError for the all-zero hash.
        """
        if rev == "0" * 40:
            raise IOError()
        fh = os.popen("GIT_DIR=%s git-cat-file %s %s 2>/dev/null"
                      % (self.path, type, rev))
        return fh.read()

    def getfile(self, name, rev):
        """Return the contents of blob rev (name is not used)."""
        return self.catfile(rev, "blob")

    def getmode(self, name, rev):
        """Return the mode flag recorded by getchanges() for (name, rev)."""
        return self.modecache[(name, rev)]

    def getchanges(self, version):
        """Return (filename, blob hash) pairs from git-diff-tree for
        version.

        Also rebuilds the mode cache: "x" for mode 100755, "l" for mode
        120000, "" otherwise.
        """
        self.modecache = {}
        fh = os.popen("GIT_DIR=%s git-diff-tree --root -m -r %s"
                      % (self.path, version))
        changes = []
        for l in fh:
            # only lines carrying a tab-separated file name describe a file
            if "\t" not in l:
                continue
            m, f = l[:-1].split("\t")
            m = m.split()
            h = m[3]
            p = (m[1] == "100755")
            s = (m[1] == "120000")
            self.modecache[(f, h)] = (p and "x") or (s and "l") or ""
            changes.append((f, h))
        return changes

    def _ident(self, value):
        """Split an author/committer header value into (name, time, tz).

        The last two whitespace-separated fields are the timestamp and the
        timezone; the rest is the name, recoded to UTF-8.  If the name
        starts with "<", its first and last characters are dropped.
        """
        p = value.split()
        tm, tz = p[-2:]
        name = " ".join(p[:-2])
        if name.startswith("<"):
            name = name[1:-1]
        return recode(name), tm, tz

    def getcommit(self, version):
        """Return the commit object for the git commit version.

        The description is the commit message followed by a
        "committer: ..." line.  The date is "<timestamp> <offset>" where
        offset is the negated timezone offset in seconds, taken from the
        last author/committer header in the commit.
        """
        c = self.catfile(version, "commit") # read the commit object
        end = c.find("\n\n")
        message = recode(c[end+2:])
        headers = c[:end].splitlines()
        parents = []
        # the first header line is skipped, as in the original loop
        for e in headers[1:]:
            n, v = e.split(" ", 1)
            if n == "author":
                author, tm, tz = self._ident(v)
            if n == "committer":
                committer, tm, tz = self._ident(v)
                message += "\ncommitter: %s\n" % committer
            if n == "parent":
                parents.append(v)

        # tz looks like "+HHMM" / "-HHMM"; convert hours AND minutes to
        # seconds before negating
        tzs, tzh, tzm = tz[-5:-4] + "1", tz[-4:-2], tz[-2:]
        tz = -int(tzs) * (int(tzh) * 3600 + int(tzm) * 60)
        date = tm + " " + str(tz)

        return commit(parents=parents, date=date, author=author, desc=message)

    def gettags(self):
        """Return {tag name: hash} for the peeled ("^{}") refs/tags/
        entries reported by git-ls-remote --tags."""
        tags = {}
        fh = os.popen('git-ls-remote --tags "%s" 2>/dev/null' % self.path)
        prefix = 'refs/tags/'
        for line in fh:
            line = line.strip()
            if not line.endswith("^{}"):
                continue
            node, tag = line.split(None, 1)
            if not tag.startswith(prefix):
                continue
            # strip the prefix and the trailing "^{}"
            tag = tag[len(prefix):-3]
            tags[tag] = node

        return tags
457
class convert_mercurial(converter_sink):
    """Conversion sink that writes into a Mercurial repository."""

    def __init__(self, path):
        """Open the Mercurial repository at path.

        Raises NoRepo if hg.repository() fails for path.
        """
        self.path = path
        u = ui.ui()
        try:
            self.repo = hg.repository(u, path)
        except Exception:
            raise NoRepo("could not open hg repo %s" % path)

    def mapfile(self):
        """Return the default revision map location, <path>/.hg/shamap."""
        return os.path.join(self.path, ".hg", "shamap")

    def getheads(self):
        """Return the hex ids of the changelog heads."""
        h = self.repo.changelog.heads()
        return [hg.hex(x) for x in h]

    def putfile(self, f, e, data):
        """Write data to working-directory file f with flags e and mark
        the file as added if the dirstate reports it as unknown ('?')."""
        self.repo.wwrite(f, data, e)
        if self.repo.dirstate.state(f) == '?':
            self.repo.dirstate.update([f], "a")

    def delfile(self, f):
        """Remove working-directory file f; a file that cannot be removed
        (e.g. because it does not exist) is ignored."""
        try:
            os.unlink(self.repo.wjoin(f))
            #self.repo.remove([f])
        except OSError:
            pass

    def putcommit(self, files, parents, commit):
        """Commit files with the given parents and return the hex id of
        the resulting tip.

        Duplicate parents are dropped and the list is padded with the null
        id up to two entries.  With more than two parents a chain of
        commits is created, each merging in one further parent; the extra
        commits get the description "(octopus merge fixup)".
        """
        seen = {}
        pl = []
        for p in parents:
            if p not in seen:
                pl.append(p)
                seen[p] = 1
        parents = pl

        while len(parents) < 2:
            parents.append("0" * 40)
        p2 = parents.pop(0)

        text = commit.desc
        extra = {}
        try:
            extra["branch"] = commit.branch
        except AttributeError:
            # the source did not record a branch for this commit
            pass

        while parents:
            p1 = p2
            p2 = parents.pop(0)
            self.repo.rawcommit(files, text, commit.author, commit.date,
                                hg.bin(p1), hg.bin(p2), extra=extra)
            text = "(octopus merge fixup)\n"
            p2 = hg.hex(self.repo.changelog.tip())

        return p2

    def puttags(self, tags):
        """Rewrite .hgtags from tags ({tagname: sink_rev_id}) and commit it.

        Returns the hex id of the new tip when .hgtags changed; returns
        None when the sorted new content equals the sorted old content.
        """
        try:
            old = self.repo.wfile(".hgtags").read()
            oldlines = old.splitlines(1)
            oldlines.sort()
        except (IOError, OSError):
            # treated as "no tags yet"
            oldlines = []

        newlines = ["%s %s\n" % (tags[tag], tag) for tag in tags]
        newlines.sort()

        if newlines != oldlines:
            status("updating tags\n")
            f = self.repo.wfile(".hgtags", "w")
            try:
                f.write("".join(newlines))
            finally:
                f.close()
            if not oldlines:
                self.repo.add([".hgtags"])
            # time.time() is already seconds since the epoch in UTC, which
            # matches the "0" offset; mktime(gmtime()) would be off by the
            # local UTC offset
            date = "%s 0" % int(time.time())
            self.repo.rawcommit([".hgtags"], "update tags", "convert-repo",
                                date, self.repo.changelog.tip(), hg.nullid)
            return hg.hex(self.repo.changelog.tip())
542
# Converter classes tried, in this order, by converter().
converters = [convert_cvs, convert_git, convert_mercurial]


def converter(path):
    """Return an instance of the first class in converters that accepts
    path.

    Aborts when path is not a directory or when every class raises NoRepo.
    """
    if not os.path.isdir(path):
        abort("%s: not a directory\n" % path)
    for factory in converters:
        try:
            instance = factory(path)
        except NoRepo:
            continue
        return instance
    abort("%s: unknown repository type\n" % path)
554
class convert(object):
    """Drives a conversion from a source to a sink.

    self.map holds {source revision id: sink revision id}; it is loaded
    from mapfile on construction and each newly converted revision is
    appended to that file.
    """

    def __init__(self, source, dest, mapfile, opts):
        """Remember the arguments and load the revision map.

        source:  conversion source object
        dest:    conversion sink object
        mapfile: path of the revision map; a file that cannot be opened
                 leaves the map empty
        opts:    option dictionary; 'datesort' is consulted by toposort()
        """
        self.source = source
        self.dest = dest
        self.mapfile = mapfile
        self.opts = opts
        self.commitcache = {}

        self.map = {}
        try:
            fp = open(self.mapfile)
            try:
                for l in fp:
                    fields = l.split()
                    if not fields:
                        # tolerate blank lines
                        continue
                    sv, dv = fields
                    self.map[sv] = dv
            finally:
                fp.close()
        except IOError:
            pass

    def _record(self, rev, destrev):
        """Append the line "rev destrev" to the map file."""
        fp = open(self.mapfile, "a")
        try:
            fp.write("%s %s\n" % (rev, destrev))
        finally:
            fp.close()

    def walktree(self, heads):
        """Walk the source history backwards from heads.

        Revisions already in self.map are not visited.  Every visited
        commit is stored in self.commitcache.  Returns a dictionary
        mapping each visited revision that has parents to the list of its
        parents.  heads itself is left unmodified.
        """
        visit = list(heads)
        known = {}
        parents = {}
        while visit:
            n = visit.pop(0)
            if n in known or n in self.map:
                continue
            known[n] = 1
            self.commitcache[n] = self.source.getcommit(n)
            cp = self.commitcache[n].parents
            for p in cp:
                parents.setdefault(n, []).append(p)
                visit.append(p)

        return parents

    def toposort(self, parents):
        """Return the unconverted revisions ordered so that every revision
        comes after its unconverted parents.

        parents is the mapping returned by walktree().  When the
        'datesort' option is set, the result is re-sorted by (depth, date,
        revision id), where depth is one more than the largest depth among
        a revision's unconverted parents.
        """
        visit = list(parents)
        seen = {}
        children = {}

        # invert the parent relation
        while visit:
            n = visit.pop(0)
            if n in seen:
                continue
            seen[n] = 1
            if n in parents:
                for p in parents[n]:
                    visit.append(p)
                    children.setdefault(p, []).append(n)

        s = []
        removed = {}
        visit = list(children)
        while visit:
            n = visit.pop(0)
            if n in removed:
                continue
            dep = 0
            if n in parents:
                for p in parents[n]:
                    if p in self.map:
                        continue
                    if p not in removed:
                        # we're still dependent
                        visit.append(n)
                        dep = 1
                        break

            if not dep:
                # all n's parents are in the list
                removed[n] = 1
                if n not in self.map:
                    s.append(n)
                if n in children:
                    for c in children[n]:
                        visit.insert(0, c)

        # use the options given to this instance, not a module global
        if self.opts.get('datesort'):
            depth = {}
            for n in s:
                depth[n] = 0
                pl = [p for p in self.commitcache[n].parents
                      if p not in self.map]
                if pl:
                    depth[n] = max([depth[p] for p in pl]) + 1

            s = [(depth[n], self.commitcache[n].date, n) for n in s]
            s.sort()
            s = [e[2] for e in s]

        return s

    def copy(self, rev):
        """Copy source revision rev into the sink and record the mapping.

        A file for which the source raises IOError is deleted in the sink;
        every other changed file is written with its mode.
        """
        c = self.commitcache[rev]
        files = self.source.getchanges(rev)

        for f, v in files:
            try:
                data = self.source.getfile(f, v)
            except IOError:
                self.dest.delfile(f)
            else:
                e = self.source.getmode(f, v)
                self.dest.putfile(f, e, data)

        r = [self.map[v] for v in c.parents]
        f = [f for f, v in files]
        self.map[rev] = self.dest.putcommit(f, r, c)
        self._record(rev, self.map[rev])

    def convert(self):
        """Convert all unconverted source revisions, then the tags."""
        status("scanning source...\n")
        heads = self.source.getheads()
        parents = self.walktree(heads)
        status("sorting...\n")
        t = self.toposort(parents)
        num = len(t)
        c = None

        status("converting...\n")
        for c in t:
            num -= 1
            desc = self.commitcache[c].desc
            if "\n" in desc:
                desc = desc.splitlines()[0]
            status("%d %s\n" % (num, desc))
            self.copy(c)

        # only tags pointing at converted revisions can be carried over
        tags = self.source.gettags()
        ctags = {}
        for k in tags:
            v = tags[k]
            if v in self.map:
                ctags[k] = self.map[v]

        if c and ctags:
            nrev = self.dest.puttags(ctags)
            # write another hash correspondence to override the previous
            # one so we don't end up with extra tag heads
            if nrev:
                self._record(c, nrev)
693
def command(src, dest=None, mapfile=None, **opts):
    """Convert the repository at src into the repository at dest.

    src:     path of the source repository
    dest:    path of the destination; defaults to src + "-hg" and is
             created with "hg init" if it is not an existing directory
    mapfile: path of the revision map; defaults to the sink's mapfile(),
             or to <dest>/map if the sink cannot provide one
    opts:    options passed on to the convert object

    Aborts if src cannot be read from or dest cannot be written to.
    """
    srcc = converter(src)
    if not hasattr(srcc, "getcommit"):
        abort("%s: can't read from this repo type\n" % src)

    if not dest:
        dest = src + "-hg"
        status("assuming destination %s\n" % dest)
    if not os.path.isdir(dest):
        status("creating repository %s\n" % dest)
        # NOTE(review): dest is passed to the shell unquoted
        os.system("hg init " + dest)
    destc = converter(dest)
    if not hasattr(destc, "putcommit"):
        # report the destination, which is the repository being rejected
        abort("%s: can't write to this repo type\n" % dest)

    if not mapfile:
        try:
            mapfile = destc.mapfile()
        except Exception:
            # join with the destination path, not the sink object
            mapfile = os.path.join(dest, "map")

    c = convert(srcc, destc, mapfile, opts)
    c.convert()
717
# Command line option table handed to fancyopts.
options = [
    ('q', 'quiet', None, 'suppress output'),
    ('', 'datesort', None, 'try to sort changesets by date'),
]
opts = {}
args = fancyopts.fancyopts(sys.argv[1:], options, opts)

# status() consults the module-level quiet flag
if opts['quiet']:
    quiet = 1

try:
    command(*args, **opts)
except KeyboardInterrupt:
    status("interrupted\n")
except Abort:
    # report the abort message on stderr
    warn(sys.exc_info()[1])