comparison hgext/convert/subversion.py @ 4758:b6a1f2c46c6c

convert extension: Add SVN converter
author Daniel Holth <dholth@fastmail.fm>
date Sun, 01 Jul 2007 23:56:11 +0200
parents
children 95cbb6b74790
comparison
equal deleted inserted replaced
4757:6a16ef0d1c7c 4758:b6a1f2c46c6c
1 # Subversion 1.4/1.5 Python API backend
2 #
3 # Copyright(C) 2007 Daniel Holth et al
4
5 import pprint
6 import locale
7
8 from mercurial import util
9
10 # Subversion stuff. Works best with very recent Python SVN bindings
11 # e.g. SVN 1.5 or backports. Thanks to the bzr folks for enhancing
12 # these bindings.
13
14 from svn.core import SubversionException, Pool
15 import svn.core
16 import svn.ra
17 import svn.delta
18 import svn
19 import transport
20 from cStringIO import StringIO
21
22 from common import NoRepo, commit, converter_source, recode, nocommitmsg
23
class CompatibilityException(Exception):
    """Raised when the Subversion bindings return a result this code cannot use."""
25
# Not referenced by the code visible in this file; presumably the intended
# batch size for fetching revisions lazily -- TODO confirm before relying on it.
nbRevisionsPerFetch = 50
27
class svn_entry(object):
    """Emulate a Subversion path change.

    Holds a snapshot of the ``copyfrom_path``, ``copyfrom_rev`` and
    ``action`` attributes read from the record passed to the constructor.
    """

    # NOTE: 'path' is declared as a slot but __init__ never assigns it, so
    # reading it before something sets it raises AttributeError.
    __slots__ = ['path', 'copyfrom_path', 'copyfrom_rev', 'action']

    def __init__(self, entry):
        """Copy the copy-source information and the action off `entry`."""
        self.copyfrom_path = entry.copyfrom_path
        self.copyfrom_rev = entry.copyfrom_rev
        self.action = entry.action

    def __str__(self):
        """Return "<action> <copyfrom_path> <copyfrom_rev>"."""
        return "%s %s %s" % (self.action, self.copyfrom_path, self.copyfrom_rev)

    def __repr__(self):
        """Use the same text as __str__ so debug output stays readable."""
        return self.__str__()
41
class svn_paths(object):
    """Emulate a Subversion ordered dictionary of changed paths.

    Iterating yields the paths in sorted order; indexing by a path returns
    the value stored for it.
    """

    __slots__ = ['values', 'order']

    def __init__(self, orig_paths):
        """Build the ordered view from `orig_paths`.

        An object with a ``keys`` method is copied as-is.  Any other
        non-empty object is iterated for its paths and every value is
        wrapped in an svn_entry.  A false value (e.g. None) yields an
        empty collection.
        """
        self.order = []
        self.values = {}
        if hasattr(orig_paths, 'keys'):
            self.order = sorted(orig_paths.keys())
            self.values.update(orig_paths)
            return
        if not orig_paths:
            return
        for path in orig_paths:
            self.order.append(path)
            self.values[path] = svn_entry(orig_paths[path])
        self.order.sort() # maybe the order it came in isn't so great...

    def __iter__(self):
        """Iterate over the paths in sorted order."""
        return iter(self.order)

    def __getitem__(self, key):
        """Return the value stored for path `key` (KeyError if unknown)."""
        return self.values[key]

    def __str__(self):
        """Render as a dict-like, one "'path': value," line per entry."""
        lines = ["'%s': %s,\n" % (path, self.values[path])
                 for path in self.order]
        return "{\n" + "".join(lines) + "}"

    def __repr__(self):
        """Use the same text as __str__."""
        return self.__str__()
74
75 # SVN conversion code stolen from bzr-svn and tailor
76 class convert_svn(converter_source):
def __init__(self, ui, url):
    """Open the Subversion repository at `url` and load its history.

    `url` may end in ``@REV`` (an integer) to work from that revision;
    otherwise the repository's latest revision number is used.  After the
    session is open, the revisions from the last change of the module
    down to revision 0 are fetched immediately.

    Raises NoRepo if a SubversionException occurs while the repository is
    being opened.
    """
    self.ui = ui

    # Support file://path@rev syntax. Useful e.g. to convert
    # deleted branches.  The suffix is only stripped once it is known to
    # be an integer, so an "@" that belongs to the URL itself (for example
    # user@host) leaves the URL intact.
    latest = None
    pieces = url.rsplit("@", 1)
    if len(pieces) == 2:
        try:
            latest = int(pieces[1])
        except ValueError:
            pass
        else:
            url = pieces[0]

    self.url = url
    self.encoding = 'UTF-8' # Subversion is always nominal UTF-8
    try:
        self.transport = transport.SvnRaTransport(url = url)
        self.ra = self.transport.ra
        self.base = svn.ra.get_repos_root(self.ra)
        # Path of the converted tree relative to the repository root.
        self.module = self.url[len(self.base):]
        self.modulemap = {} # revision, module
        self.commits = {}
        self.files = {}
        self.uuid = svn.ra.get_uuid(self.ra).decode(self.encoding)
    except SubversionException:
        raise NoRepo("couldn't open SVN repo %s" % url)

    try:
        self.get_blacklist()
    except IOError:
        # The blacklist file is optional; failing to read it is not an error.
        pass

    if not latest:
        latest = svn.ra.get_latest_revnum(self.ra)
    dirent = svn.ra.stat(self.ra, self.module, latest)
    self.last_changed = dirent.created_rev

    self.head = self.rev(self.last_changed)

    # Should lazily fetch revisions in batches of, say, 1,000...:
    self._fetch_revisions(from_revnum=self.last_changed, to_revnum=0)
115
def rev(self, revnum):
    """Return the revision id ``svn:<uuid><module>@<revnum>`` as unicode.

    The formatted value is already a unicode string, so it is returned
    directly; calling .decode() on it would first re-encode it as ASCII
    and fail for any non-ASCII character.
    """
    return u"svn:%s%s@%s" % (self.uuid, self.module, revnum)
118
def get_blacklist(self):
    """Avoid certain revision numbers.
    It is not uncommon for two nearby revisions to cancel each other
    out, e.g. 'I copied trunk into a subdirectory of itself instead
    of making a branch'. The converted repository is significantly
    smaller if we ignore such revisions.

    Reads "blacklist.txt" from the current directory: one revision number
    per line, lines starting with "#" are comments, anything that is not
    an integer is skipped.  self.blacklist is reset to an empty set before
    the file is opened, so it exists even when opening fails.

    Raises IOError if the file cannot be opened.
    """
    self.blacklist = set()
    blacklist = self.blacklist
    fp = open("blacklist.txt", "r")
    try:
        for line in fp:
            if line.startswith("#"):
                continue
            try:
                blacklist.add(int(line.strip()))
            except ValueError:
                pass # not an integer or a comment
    finally:
        # Close the handle even if reading fails part-way through.
        fp.close()
134
def is_blacklisted(self, svn_rev):
    """Return True if revision number `svn_rev` is in self.blacklist."""
    return svn_rev in self.blacklist
137
def reparent(self, module):
    """Re-root the RA session at self.base + `module`.

    The target URL is encoded with self.encoding once and that same value
    is both logged and handed to svn.ra.reparent.
    """
    svn_url = (self.base + module).encode(self.encoding)
    self.ui.debug("reparent to %s\n" % svn_url)
    svn.ra.reparent(self.ra, svn_url)
142
def _fetch_revisions(self, from_revnum = 0, to_revnum = 347, pb=None):
    """Fetch the log between two revisions and record commits and files.

    Asks the RA layer for the log of self.module from `from_revnum` to
    `to_revnum` with changed paths included.  Every revision that is not
    blacklisted is then turned into:

    * self.commits[rev]  -- a commit object (author, date, message, copies,
      branch); each commit becomes the sole parent of the commit processed
      just before it,
    * self.files[rev]    -- a list of (filename, rev) pairs,
    * self.modulemap     -- the module in effect at each revision, plus the
      copy source of the module when the module itself was copied.

    `pb` is accepted but not used.  The defaults are inherited from the
    original code; the caller visible in this file passes both bounds.

    Raises util.Abort when Subversion reports that a revision does not
    exist; any other SubversionException propagates unchanged.
    """
    # Local import: only needed to obtain the in-flight exception without
    # depending on version-specific "except X, e" / "except X as e" syntax.
    import sys

    self.parent_cset = None
    self.child_cset = None

    self.ui.debug('Fetching revisions %d to %d\n' % (from_revnum, to_revnum))

    def get_entry_from_path(path, module=self.module):
        # Given the repository url of this wc, say
        #   "http://server/plone/CMFPlone/branches/Plone-2_0-branch"
        # extract the "entry" portion (a relative path) from what
        # svn log --xml says, ie
        #   "/CMFPlone/branches/Plone-2_0-branch/tests/PloneTestCase.py"
        # that is to say "tests/PloneTestCase.py"
        #
        # NOTE: the default for `module` is the module at the time
        # _fetch_revisions was called, not the current self.module.
        if path.startswith(module):
            relative = path[len(module):]
            if relative.startswith('/'):
                return relative[1:]
            else:
                return relative

        # The path is outside our tracked tree...
        self.ui.debug('Ignoring %r since it is not under %r\n' % (path, module))
        return None

    received = []
    def rcvr(*arg, **args):
        # Log receiver callback: wrap the changed paths, then queue them.
        orig_paths, revnum, author, date, message, pool = arg
        new_orig_paths = svn_paths(orig_paths)
        rcvr2(new_orig_paths, revnum, author, date, message, pool)

    def rcvr2(orig_paths, revnum, author, date, message, pool, better_paths = None):
        if not self.is_blacklisted(revnum):
            received.append((orig_paths, revnum, author, date, message))

    def after_received(orig_paths, revnum, author, date, message):
        if revnum in self.modulemap:
            new_module = self.modulemap[revnum]
            if new_module != self.module:
                self.module = new_module
                self.reparent(self.module)

        copyfrom = {} # Map of entrypath, revision for finding source of deleted revisions.
        copies = {}
        entries = []

        def lookup_parts(p):
            # Return (prefix, entry) for the longest proper prefix of `p`
            # recorded in `copyfrom`, or None if no prefix is recorded.
            rc = None
            parts = p.split("/")
            for i in range(len(parts)):
                part = "/".join(parts[:i])
                info = part, copyfrom.get(part, None)
                if info[1] is not None:
                    # `info` is a tuple, so it must be wrapped to be
                    # formatted with a single %s.
                    self.ui.debug("Found parent directory %s\n" % (info,))
                    rc = info
            return rc

        self.ui.debug("Parsing revision %d\n" % revnum)
        rev = self.rev(revnum)
        # The branch name is the last path component of the module.
        branch = self.module.split("/")[-1]

        if orig_paths is not None:
            for path in orig_paths:
                if path == self.module: # Follow branching back in history
                    ent = orig_paths[path]
                    if ent:
                        if ent.copyfrom_path:
                            self.modulemap[ent.copyfrom_rev] = ent.copyfrom_path
                        else:
                            self.ui.debug("No copyfrom path, don't know what to do.\n")
                            # Maybe it was added and there is no more history.
                entrypath = get_entry_from_path(path, module=self.module)
                if not entrypath:
                    # Outside our area of interest
                    self.ui.debug("boring@%s: %s\n" % (revnum, path))
                    continue
                entry = entrypath.decode(self.encoding)
                ent = orig_paths[path]

                kind = svn.ra.check_path(self.ra, entrypath, revnum)
                if kind == svn.core.svn_node_file:
                    if ent.copyfrom_path:
                        copyfrom_path = get_entry_from_path(ent.copyfrom_path)
                        if copyfrom_path:
                            self.ui.debug("Copied to %s from %s@%s\n" % (entry, copyfrom_path, ent.copyfrom_rev))
                            # It's probably important for hg that the source
                            # exists in the revision's parent, not just the
                            # ent.copyfrom_rev
                            fromkind = svn.ra.check_path(self.ra, copyfrom_path, ent.copyfrom_rev)
                            if fromkind != 0:
                                copies[self.recode(entry)] = self.recode(copyfrom_path)
                    entries.append(self.recode(entry))
                elif kind == 0: # gone, but had better be a deleted *file*
                    self.ui.debug("gone from %s\n" % ent.copyfrom_rev)

                    fromrev = revnum - 1
                    # might always need to be revnum - 1 in these 3 lines?
                    old_module = self.modulemap.get(fromrev, self.module)
                    # `entrypath` is this path relative to self.module.
                    basepath = old_module + "/" + entrypath
                    entrypath = basepath

                    self.ui.debug("base, entry %s %s\n" % (basepath, entrypath))

                    frompath, froment = lookup_parts(entrypath) or (None, revnum - 1)

                    # need to remove fragment from lookup_parts and replace with copyfrom_path
                    if frompath is not None:
                        self.ui.debug("munge-o-matic\n")
                        self.ui.debug(entrypath + '\n')
                        self.ui.debug(entrypath[len(frompath):] + '\n')
                        entrypath = froment.copyfrom_path + entrypath[len(frompath):]
                        fromrev = froment.copyfrom_rev
                        self.ui.debug("Info: %s %s %s %s\n" % (frompath, froment, ent, entrypath))

                    fromkind = svn.ra.check_path(self.ra, entrypath, fromrev)
                    if fromkind == svn.core.svn_node_file: # a deleted file
                        entries.append(self.recode(entry))
                    else:
                        # Sometimes this is tricky. For example: in
                        # The Subversion Repository revision 6940 a dir
                        # was copied and one of its files was deleted
                        # from the new location in the same commit. This
                        # code can't deal with that yet.
                        if ent.action == 'C':
                            children = self._find_children(path, fromrev)
                        else:
                            oroot = entrypath.strip('/')
                            nroot = path.strip('/')
                            children = self._find_children(oroot, fromrev)
                            children = [s.replace(oroot,nroot) for s in children]
                        # Mark all [files, not directories] as deleted.
                        for child in children:
                            # Can we move a child directory and its
                            # parent in the same commit? (probably can). Could
                            # cause problems if instead of revnum -1,
                            # we have to look in (copyfrom_path, revnum - 1)
                            entrypath = get_entry_from_path("/" + child, module=old_module)
                            if entrypath:
                                entry = self.recode(entrypath.decode(self.encoding))
                                if entry in copies:
                                    # deleted file within a copy
                                    del copies[entry]
                                else:
                                    entries.append(entry)
                elif kind == svn.core.svn_node_dir:
                    # Should probably synthesize normal file entries
                    # and handle as above to clean up copy/rename handling.

                    # If the directory just had a prop change,
                    # then we shouldn't need to look for its children.
                    # Also this could create duplicate entries. Not sure
                    # whether this will matter. Maybe should make entries a set.
                    # This will fail if a directory was copied
                    # from another branch and then some of its files
                    # were deleted in the same transaction.
                    children = self._find_children(path, revnum)
                    children.sort()
                    for child in children:
                        # Can we move a child directory and its
                        # parent in the same commit? (probably can). Could
                        # cause problems if instead of revnum -1,
                        # we have to look in (copyfrom_path, revnum - 1)
                        entrypath = get_entry_from_path("/" + child, module=self.module)
                        if entrypath:
                            # Need to filter out directories here...
                            kind = svn.ra.check_path(self.ra, entrypath, revnum)
                            if kind != svn.core.svn_node_dir:
                                entries.append(self.recode(entrypath))

                    # Copies here (must copy all from source)
                    # Probably not a real problem for us if
                    # source does not exist
                    if ent.copyfrom_path:
                        copyfrom_path = ent.copyfrom_path.decode(self.encoding)
                        copyfrom_entry = get_entry_from_path(copyfrom_path, module=self.module)
                        if copyfrom_entry:
                            copyfrom[path] = ent
                            self.ui.debug("mark %s came from %s\n" % (path, copyfrom[path]))

                            # Good, /probably/ a regular copy. Really should check
                            # to see whether the parent revision actually contains
                            # the directory in question.
                            children = self._find_children(self.recode(copyfrom_path), ent.copyfrom_rev)
                            children.sort()
                            for child in children:
                                entrypath = get_entry_from_path("/" + child, module=self.module)
                                if entrypath:
                                    entry = entrypath.decode(self.encoding)
                                    copyto_path = path + entry[len(copyfrom_entry):]
                                    copyto_entry = get_entry_from_path(copyto_path, module=self.module)
                                    copies[self.recode(copyto_entry)] = self.recode(entry)

        self.modulemap[revnum] = self.module # track backwards in time
        # a list of (filename, id) where id lets us retrieve the file.
        # eg in git, id is the object hash. Here the id is the revision
        # id string of this revision.
        self.files[rev] = [(name, rev) for name in entries]

        # Example SVN datetime. Includes microseconds.
        # ISO-8601 conformant
        # '2007-01-04T17:35:00.902377Z'
        # The date and time up to whole seconds are the first 19
        # characters; slicing fewer cuts off the last digit of the seconds.
        date = util.parsedate(date[:19] + " UTC", ["%Y-%m-%dT%H:%M:%S"])

        log = message and self.recode(message) or nocommitmsg
        author = author and self.recode(author) or ''

        cset = commit(author=author,
                      date=util.datestr(date),
                      desc=log,
                      parents=[],
                      copies=copies,
                      branch=branch)

        # The commit handled just before this one gets this revision as
        # its parent.
        if self.child_cset is not None:
            self.child_cset.parents = [rev]

        self.child_cset = cset

        self.commits[rev] = cset

    try:
        discover_changed_paths = True
        strict_node_history = False
        svn.ra.get_log(self.ra, [self.module], from_revnum, to_revnum,
                       0, discover_changed_paths, strict_node_history, rcvr)
        for args in received:
            after_received(*args)
        self.last_revnum = to_revnum
    except SubversionException:
        err = sys.exc_info()[1]
        # The exception carries (message, error number) as its arguments.
        _, num = err.args
        if num == svn.core.SVN_ERR_FS_NO_SUCH_REVISION:
            raise util.Abort("no such revision: %d" % to_revnum)
        raise
401
def getheads(self):
    """Return the list of heads: a one-element list holding self.head.

    The list is also stored in self.heads.
    """
    # svn-url@rev
    # Not safe if someone committed:
    self.heads = [self.head]
    return self.heads
408
def _getfile(self, file, rev):
    """Fetch `file` at revision id `rev` and return (data, mode).

    The revision number is the text after the last "@" of `rev`.  If that
    revision belongs to a different module than the current one, the RA
    session is reparented first.  `mode` is 'l' when the file has the
    svn:special property, else 'x' when it has svn:executable, else ''.
    For 'l' files a leading "link " is stripped from the data.

    Raises IOError when Subversion reports the file as not found; other
    SubversionExceptions propagate unchanged.
    """
    # Local import: only needed to obtain the in-flight exception without
    # depending on version-specific "except X, e" / "except X as e" syntax.
    import sys

    buf = StringIO()
    # TODO: ra.get_file transmits the whole file instead of diffs.
    mode = ''
    try:
        revnum = int(rev.split("@")[-1])
        wanted_module = self.modulemap[revnum]
        if self.module != wanted_module:
            self.module = wanted_module
            self.reparent(self.module)
        info = svn.ra.get_file(self.ra, file, revnum, buf)
        if isinstance(info, list):
            # Some bindings return a list; the properties are its last item.
            info = info[-1]
        if "svn:executable" in info:
            mode = 'x'
        if "svn:special" in info:
            # svn:special takes precedence over svn:executable.
            mode = 'l'
    except SubversionException:
        e = sys.exc_info()[1]
        notfound = (svn.core.SVN_ERR_FS_NOT_FOUND,
                    svn.core.SVN_ERR_RA_DAV_PATH_NOT_FOUND)
        if e.apr_err in notfound: # File not found
            raise IOError()
        raise
    data = buf.getvalue()
    if mode == 'l':
        link_prefix = "link "
        if data.startswith(link_prefix):
            data = data[len(link_prefix):]
    return data, mode
435
def getfile(self, file, rev):
    """Return the contents of `file` at `rev`.

    The mode reported alongside the data is remembered in self.modecache
    under the key (file, rev).
    """
    data, mode = self._getfile(file, rev)
    self.modecache[(file, rev)] = mode
    return data
440
def getmode(self, file, rev):
    """Return the mode cached for (file, rev); KeyError if none is cached."""
    return self.modecache[(file, rev)]
443
def getchanges(self, rev):
    """Return the sorted list of changes recorded for `rev`.

    Resets self.modecache to an empty dict.  The list stored in
    self.files[rev] is sorted in place and that same list is returned.
    """
    self.modecache = {}
    changes = self.files[rev]
    changes.sort()
    return changes
450
def getcommit(self, rev):
    """Return the commit recorded for `rev`; KeyError if there is none."""
    return self.commits[rev]
453
def gettags(self):
    """Return the tags of the source; this implementation reports none."""
    return []
456
def _find_children(self, path, revnum):
    """List what lives below directory `path` at revision `revnum`.

    `path` is relative to the repository root.  Normally the RA session is
    reparented to the directory, the tree is walked recursively and every
    non-directory entry is returned as "<path>/<relative name>"; the
    session is reparented back to self.module afterwards.

    If the bindings return the directory listing in an unusable shape,
    the listing is obtained through svn.client.ls instead and that choice
    is remembered on the instance for all later calls.
    """
    path = path.strip("/")

    def _find_children_fallback(path, revnum):
        # SWIG python bindings for getdir are broken up to at least 1.4.3
        # svn.client is not among this file's top-level imports, so bring
        # it in here, where it is actually needed.
        import svn.client
        if not hasattr(self, 'client_ctx'):
            self.client_ctx = svn.client.create_context()
        optrev = svn.core.svn_opt_revision_t()
        optrev.kind = svn.core.svn_opt_revision_number
        optrev.value.number = revnum
        # `path` is relative to the repository root, so the URL is built
        # from self.base, the same way reparent() does.
        rpath = '/'.join([self.base, path]).strip('/')
        return ['%s/%s' % (path, x) for x in svn.client.ls(rpath, optrev, True, self.client_ctx).keys()]

    # The instance attribute of this name is only a flag set further down.
    if hasattr(self, '_find_children_fallback'):
        return _find_children_fallback(path, revnum)

    self.reparent("/" + path)
    pool = Pool()

    children = []
    def find_children_inner(children, path, revnum = revnum):
        if hasattr(svn.ra, 'get_dir2'): # Since SVN 1.4
            fields = 0xffffffff # Binding does not provide SVN_DIRENT_ALL
            getdir = svn.ra.get_dir2(self.ra, path, revnum, fields, pool)
        else:
            getdir = svn.ra.get_dir(self.ra, path, revnum, pool)
        if isinstance(getdir, dict):
            # python binding for getdir is broken up to at least 1.4.3
            raise CompatibilityException()
        dirents = getdir[0]
        if isinstance(dirents, int):
            # got here once due to infinite recursion bug
            return
        for child in sorted(dirents.keys()):
            dirent = dirents[child]
            child_path = (path + "/" + child).strip("/")
            if dirent.kind == svn.core.svn_node_dir:
                find_children_inner(children, child_path)
            else:
                children.append(child_path)

    try:
        find_children_inner(children, "")
    except CompatibilityException:
        self._find_children_fallback = True
        self.reparent(self.module)
        return _find_children_fallback(path, revnum)

    self.reparent(self.module)
    return [path + "/" + c for c in children]
509
def recode(self, s):
    """Recode `s` using this source's encoding (self.encoding)."""
    # Within the class, this call resolves to the module-level recode()
    # imported from common: method bodies do not see class-scope names.
    return recode(self.encoding, s)