hgext/convert/subversion.py
changeset 4758 b6a1f2c46c6c
child 4759 95cbb6b74790
equal deleted inserted replaced
4757:6a16ef0d1c7c 4758:b6a1f2c46c6c
       
     1 # Subversion 1.4/1.5 Python API backend
       
     2 #
       
     3 # Copyright(C) 2007 Daniel Holth et al
       
     4 
       
     5 import pprint
       
     6 import locale
       
     7 
       
     8 from mercurial import util
       
     9 
       
    10 # Subversion stuff. Works best with very recent Python SVN bindings
       
    11 # e.g. SVN 1.5 or backports. Thanks to the bzr folks for enhancing
       
    12 # these bindings.
       
    13 
       
    14 from svn.core import SubversionException, Pool
       
    15 import svn.core
       
    16 import svn.ra
       
    17 import svn.delta
       
    18 import svn
       
    19 import transport
       
    20 from cStringIO import StringIO
       
    21 
       
    22 from common import NoRepo, commit, converter_source, recode, nocommitmsg
       
    23 
       
    24 class CompatibilityException(Exception): pass
       
    25 
       
# Number of revisions per fetch.
# NOTE(review): not referenced anywhere in the code visible here --
# presumably reserved for batched fetching; confirm before relying on it.
nbRevisionsPerFetch = 50
       
    27 
       
    28 class svn_entry(object):
       
    29     """Emulate a Subversion path change."""
       
    30     __slots__ = ['path', 'copyfrom_path', 'copyfrom_rev', 'action']
       
    31     def __init__(self, entry):
       
    32         self.copyfrom_path = entry.copyfrom_path
       
    33         self.copyfrom_rev = entry.copyfrom_rev
       
    34         self.action = entry.action
       
    35 
       
    36     def __str__(self):
       
    37         return "%s %s %s" % (self.action, self.copyfrom_path, self.copyfrom_rev)
       
    38 
       
    39     def __repr__(self):
       
    40         return self.__str__()
       
    41 
       
    42 class svn_paths(object):
       
    43     """Emulate a Subversion ordered dictionary of changed paths."""
       
    44     __slots__ = ['values', 'order']
       
    45     def __init__(self, orig_paths):
       
    46         self.order = []
       
    47         self.values = {}
       
    48         if hasattr(orig_paths, 'keys'):
       
    49             self.order = sorted(orig_paths.keys())
       
    50             self.values.update(orig_paths)
       
    51             return
       
    52         if not orig_paths:
       
    53             return
       
    54         for path in orig_paths:
       
    55             self.order.append(path)
       
    56             self.values[path] = svn_entry(orig_paths[path])
       
    57         self.order.sort() # maybe the order it came in isn't so great...
       
    58 
       
    59     def __iter__(self):
       
    60         return iter(self.order)
       
    61 
       
    62     def __getitem__(self, key):
       
    63         return self.values[key]
       
    64 
       
    65     def __str__(self):
       
    66         s = "{\n"
       
    67         for path in self.order:
       
    68             s += "'%s': %s,\n" % (path, self.values[path])
       
    69         s += "}"
       
    70         return s
       
    71     
       
    72     def __repr__(self):
       
    73         return self.__str__()
       
    74 
       
    75 # SVN conversion code stolen from bzr-svn and tailor
       
    76 class convert_svn(converter_source):
       
    def __init__(self, ui, url):
        """Open the Subversion repository at *url* and read its history.

        ui  -- object whose debug() method receives diagnostic output.
        url -- repository URL, optionally suffixed with ``@REV`` to pick
               the newest revision to convert (e.g. ``file://path@42``).

        Raises NoRepo when a SubversionException occurs while opening
        the repository.
        """
        self.ui = ui
        # Support file://path@rev syntax. Useful e.g. to convert
        # deleted branches.  The suffix is only stripped when it really
        # is an integer, so that URLs such as svn+ssh://user@host/repo
        # are left untouched.
        latest = None
        at = url.rfind("@")
        if at >= 0:
            try:
                latest = int(url[at + 1:])
            except ValueError:
                pass
            else:
                url = url[:at]
        self.url = url
        self.encoding = 'UTF-8' # Subversion is always nominal UTF-8
        try:
            self.transport = transport.SvnRaTransport(url = url)
            self.ra = self.transport.ra
            self.base = svn.ra.get_repos_root(self.ra)
            # Path of the converted tree below the repository root.
            self.module = self.url[len(self.base):]
            self.modulemap = {} # revision, module
            self.commits = {}
            self.files = {}
            self.uuid = svn.ra.get_uuid(self.ra).decode(self.encoding)
        except SubversionException:
            raise NoRepo("couldn't open SVN repo %s" % url)

        try:
            self.get_blacklist()
        except IOError:
            # blacklist.txt could not be read; self.blacklist was already
            # set to an empty set by get_blacklist before it tried.
            pass

        if not latest:
            latest = svn.ra.get_latest_revnum(self.ra)
        dirent = svn.ra.stat(self.ra, self.module, latest)
        self.last_changed = dirent.created_rev

        self.head = self.rev(self.last_changed)

        # Should lazily fetch revisions in batches of, say, 1,000...:
        self._fetch_revisions(from_revnum=self.last_changed, to_revnum=0)
       
   115 
       
   116     def rev(self, revnum):
       
   117         return (u"svn:%s%s@%s" % (self.uuid, self.module, revnum)).decode(self.encoding)
       
   118             
       
   119     def get_blacklist(self):
       
   120         """Avoid certain revision numbers.
       
   121         It is not uncommon for two nearby revisions to cancel each other
       
   122         out, e.g. 'I copied trunk into a subdirectory of itself instead
       
   123         of making a branch'. The converted repository is significantly
       
   124         smaller if we ignore such revisions."""
       
   125         self.blacklist = set()
       
   126         blacklist = self.blacklist
       
   127         for line in file("blacklist.txt", "r"):
       
   128             if not line.startswith("#"):
       
   129                 try:
       
   130                     svn_rev = int(line.strip())
       
   131                     blacklist.add(svn_rev)
       
   132                 except ValueError, e:
       
   133                     pass # not an integer or a comment
       
   134 
       
   135     def is_blacklisted(self, svn_rev):
       
   136         return svn_rev in self.blacklist
       
   137 
       
   138     def reparent(self, module):
       
   139         svn_url = self.base + module
       
   140         self.ui.debug("reparent to %s\n" % svn_url.encode(self.encoding))
       
   141         svn.ra.reparent(self.ra, svn_url.encode(self.encoding))
       
   142 
       
   143     def _fetch_revisions(self, from_revnum = 0, to_revnum = 347, pb=None):
       
   144         self.parent_cset = None
       
   145         self.child_cset = None
       
   146         
       
   147         self.ui.debug('Fetching revisions %d to %d\n' % (from_revnum, to_revnum))
       
   148 
       
   149         def get_entry_from_path(path, module=self.module):
       
   150             # Given the repository url of this wc, say
       
   151             #   "http://server/plone/CMFPlone/branches/Plone-2_0-branch"
       
   152             # extract the "entry" portion (a relative path) from what
       
   153             # svn log --xml says, ie
       
   154             #   "/CMFPlone/branches/Plone-2_0-branch/tests/PloneTestCase.py"
       
   155             # that is to say "tests/PloneTestCase.py"
       
   156 
       
   157             if path.startswith(module):
       
   158                 relative = path[len(module):]
       
   159                 if relative.startswith('/'):
       
   160                     return relative[1:]
       
   161                 else:
       
   162                     return relative
       
   163 
       
   164             # The path is outside our tracked tree...
       
   165             self.ui.debug('Ignoring %r since it is not under %r\n' % (path, module))
       
   166             return None
       
   167 
       
   168         received = []
       
   169         def rcvr(*arg, **args):
       
   170             orig_paths, revnum, author, date, message, pool = arg
       
   171             new_orig_paths = svn_paths(orig_paths)
       
   172             rcvr2(new_orig_paths, revnum, author, date, message, pool)
       
   173 
       
   174         def rcvr2(orig_paths, revnum, author, date, message, pool, better_paths = None):
       
   175             if not self.is_blacklisted(revnum):
       
   176                 received.append((orig_paths, revnum, author, date, message))
       
   177            
       
   178         def after_received(orig_paths, revnum, author, date, message):
       
   179             if revnum == 1172:
       
   180                 import pdb
       
   181                 pdb.set_trace()
       
   182             if revnum in self.modulemap:
       
   183                 new_module = self.modulemap[revnum]
       
   184                 if new_module != self.module:
       
   185                     self.module = new_module
       
   186                     self.reparent(self.module)
       
   187 
       
   188             copyfrom = {} # Map of entrypath, revision for finding source of deleted revisions.
       
   189             copies = {}
       
   190             entries = []
       
   191             self.ui.debug("Parsing revision %d\n" % revnum)
       
   192             if orig_paths is not None:
       
   193                 rev = self.rev(revnum)
       
   194                 try:
       
   195                     branch = self.module.split("/")[-1]
       
   196                 except IndexError:
       
   197                     branch = None
       
   198                 
       
   199                 for path in orig_paths:
       
   200                     # self.ui.write("path %s\n" % path)
       
   201                     if path == self.module: # Follow branching back in history
       
   202                         ent = orig_paths[path]
       
   203                         if ent:
       
   204                             if ent.copyfrom_path:
       
   205                                 self.modulemap[ent.copyfrom_rev] = ent.copyfrom_path
       
   206                             else:
       
   207                                 self.ui.debug("No copyfrom path, don't know what to do.\n")
       
   208                                 # Maybe it was added and there is no more history.
       
   209                     entrypath = get_entry_from_path(path, module=self.module)
       
   210                     # self.ui.write("entrypath %s\n" % entrypath)
       
   211                     if not entrypath:
       
   212                         # Outside our area of interest
       
   213                         self.ui.debug("boring@%s: %s\n" % (revnum, path))
       
   214                         continue
       
   215                     entry = entrypath.decode(self.encoding)
       
   216                     ent = orig_paths[path]
       
   217 
       
   218                     kind = svn.ra.check_path(self.ra, entrypath, revnum)
       
   219                     if kind == svn.core.svn_node_file:
       
   220                         if ent.copyfrom_path:
       
   221                             copyfrom_path = get_entry_from_path(ent.copyfrom_path)
       
   222                             if copyfrom_path:
       
   223                                 self.ui.debug("Copied to %s from %s@%s\n" % (entry, copyfrom_path, ent.copyfrom_rev))
       
   224                                 # It's probably important for hg that the source
       
   225                                 # exists in the revision's parent, not just the
       
   226                                 # ent.copyfrom_rev
       
   227                                 fromkind = svn.ra.check_path(self.ra, copyfrom_path, ent.copyfrom_rev)
       
   228                                 if fromkind != 0:
       
   229                                     copies[self.recode(entry)] = self.recode(copyfrom_path)
       
   230                         entries.append(self.recode(entry))
       
   231                     elif kind == 0: # gone, but had better be a deleted *file*
       
   232                         self.ui.debug("gone from %s\n" % ent.copyfrom_rev)
       
   233 
       
   234                         fromrev = revnum - 1
       
   235                         # might always need to be revnum - 1 in these 3 lines?
       
   236                         old_module = self.modulemap.get(fromrev, self.module)
       
   237                         basepath = old_module + "/" + get_entry_from_path(path, module=self.module)
       
   238                         entrypath = old_module + "/" + get_entry_from_path(path, module=self.module)
       
   239 
       
   240                         def lookup_parts(p):
       
   241                             rc = None
       
   242                             parts = p.split("/")
       
   243                             for i in range(len(parts)):
       
   244                                 part = "/".join(parts[:i])
       
   245                                 info = part, copyfrom.get(part, None)
       
   246                                 if info[1] is not None:
       
   247                                     self.ui.debug("Found parent directory %s\n" % info)
       
   248                                     rc = info
       
   249                             return rc
       
   250 
       
   251                         self.ui.debug("base, entry %s %s\n" % (basepath, entrypath))
       
   252 
       
   253                         frompath, froment = lookup_parts(entrypath) or (None, revnum - 1)
       
   254 
       
   255                         # need to remove fragment from lookup_parts and replace with copyfrom_path
       
   256                         if frompath is not None:
       
   257                             self.ui.debug("munge-o-matic\n")
       
   258                             self.ui.debug(entrypath + '\n')
       
   259                             self.ui.debug(entrypath[len(frompath):] + '\n')
       
   260                             entrypath = froment.copyfrom_path + entrypath[len(frompath):]
       
   261                             fromrev = froment.copyfrom_rev
       
   262                             self.ui.debug("Info: %s %s %s %s\n" % (frompath, froment, ent, entrypath))
       
   263 
       
   264                         fromkind = svn.ra.check_path(self.ra, entrypath, fromrev)
       
   265                         if fromkind == svn.core.svn_node_file:   # a deleted file
       
   266                             entries.append(self.recode(entry))
       
   267                         else:
       
   268                             # print "Deleted/moved non-file:", revnum, path, ent
       
   269                             # children = self._find_children(path, revnum - 1)
       
   270                             # print "find children %s@%d from %d action %s" % (path, revnum, ent.copyfrom_rev, ent.action)
       
   271                             # Sometimes this is tricky. For example: in
       
   272                             # The Subversion Repository revision 6940 a dir
       
   273                             # was copied and one of its files was deleted 
       
   274                             # from the new location in the same commit. This
       
   275                             # code can't deal with that yet.
       
   276                             if ent.action == 'C':
       
   277                                 children = self._find_children(path, fromrev)
       
   278                             else:
       
   279                                 oroot = entrypath.strip('/')
       
   280                                 nroot = path.strip('/')
       
   281                                 children = self._find_children(oroot, fromrev)
       
   282                                 children = [s.replace(oroot,nroot) for s in children]
       
   283                             # Mark all [files, not directories] as deleted.
       
   284                             for child in children:
       
   285                                 # Can we move a child directory and its
       
   286                                 # parent in the same commit? (probably can). Could
       
   287                                 # cause problems if instead of revnum -1, 
       
   288                                 # we have to look in (copyfrom_path, revnum - 1)
       
   289                                 entrypath = get_entry_from_path("/" + child, module=old_module)
       
   290                                 if entrypath:
       
   291                                     entry = self.recode(entrypath.decode(self.encoding))
       
   292                                     if entry in copies:
       
   293                                         # deleted file within a copy
       
   294                                         del copies[entry]
       
   295                                     else:
       
   296                                         entries.append(entry)
       
   297                     elif kind == svn.core.svn_node_dir:
       
   298                         # Should probably synthesize normal file entries
       
   299                         # and handle as above to clean up copy/rename handling.
       
   300 
       
   301                         # If the directory just had a prop change,
       
   302                         # then we shouldn't need to look for its children.
       
   303                         # Also this could create duplicate entries. Not sure
       
   304                         # whether this will matter. Maybe should make entries a set.
       
   305                         # print "Changed directory", revnum, path, ent.action, ent.copyfrom_path, ent.copyfrom_rev
       
   306                         # This will fail if a directory was copied
       
   307                         # from another branch and then some of its files
       
   308                         # were deleted in the same transaction.
       
   309                         children = self._find_children(path, revnum)
       
   310                         children.sort()
       
   311                         for child in children:
       
   312                             # Can we move a child directory and its
       
   313                             # parent in the same commit? (probably can). Could
       
   314                             # cause problems if instead of revnum -1, 
       
   315                             # we have to look in (copyfrom_path, revnum - 1)
       
   316                             entrypath = get_entry_from_path("/" + child, module=self.module)
       
   317                             # print child, self.module, entrypath
       
   318                             if entrypath:
       
   319                                 # Need to filter out directories here...
       
   320                                 kind = svn.ra.check_path(self.ra, entrypath, revnum)
       
   321                                 if kind != svn.core.svn_node_dir:
       
   322                                     entries.append(self.recode(entrypath))
       
   323 
       
   324                         # Copies here (must copy all from source)
       
   325                         # Probably not a real problem for us if
       
   326                         # source does not exist
       
   327 
       
   328                         # Can do this with the copy command "hg copy"
       
   329                         # if ent.copyfrom_path:
       
   330                         #     copyfrom_entry = get_entry_from_path(ent.copyfrom_path.decode(self.encoding),
       
   331                         #             module=self.module)
       
   332                         #     copyto_entry = entrypath
       
   333                         #
       
   334                         #     print "copy directory", copyfrom_entry, 'to', copyto_entry
       
   335                         #
       
   336                         #     copies.append((copyfrom_entry, copyto_entry))
       
   337                         
       
   338                         if ent.copyfrom_path:
       
   339                             copyfrom_path = ent.copyfrom_path.decode(self.encoding)
       
   340                             copyfrom_entry = get_entry_from_path(copyfrom_path, module=self.module)
       
   341                             if copyfrom_entry:
       
   342                                 copyfrom[path] = ent
       
   343                                 self.ui.debug("mark %s came from %s\n" % (path, copyfrom[path]))
       
   344 
       
   345                                 # Good, /probably/ a regular copy. Really should check
       
   346                                 # to see whether the parent revision actually contains
       
   347                                 # the directory in question.
       
   348                                 children = self._find_children(self.recode(copyfrom_path), ent.copyfrom_rev)
       
   349                                 children.sort()
       
   350                                 for child in children:
       
   351                                     entrypath = get_entry_from_path("/" + child, module=self.module)
       
   352                                     if entrypath:
       
   353                                         entry = entrypath.decode(self.encoding)
       
   354                                         # print "COPY COPY From", copyfrom_entry, entry
       
   355                                         copyto_path = path + entry[len(copyfrom_entry):]
       
   356                                         copyto_entry =  get_entry_from_path(copyto_path, module=self.module)
       
   357                                         # print "COPY", entry, "COPY To", copyto_entry
       
   358                                         copies[self.recode(copyto_entry)] = self.recode(entry)
       
   359                                         # copy from quux splort/quuxfile
       
   360               
       
   361                 self.modulemap[revnum] = self.module # track backwards in time
       
   362                 # a list of (filename, id) where id lets us retrieve the file.
       
   363                 # eg in git, id is the object hash. for svn it'll be the 
       
   364                 self.files[rev] = zip(entries, [rev] * len(entries))
       
   365 
       
   366                 # Example SVN datetime. Includes microseconds.
       
   367                 # ISO-8601 conformant
       
   368                 # '2007-01-04T17:35:00.902377Z'
       
   369                 date = util.parsedate(date[:18] + " UTC", ["%Y-%m-%dT%H:%M:%S"])
       
   370 
       
   371                 log = message and self.recode(message) or nocommitmsg
       
   372                 author = author and self.recode(author) or ''
       
   373 
       
   374                 cset = commit(author=author,
       
   375                         date=util.datestr(date), 
       
   376                         desc=log, 
       
   377                         parents=[],
       
   378                         copies=copies,
       
   379                         branch=branch)
       
   380 
       
   381                 if self.child_cset is not None:
       
   382                     self.child_cset.parents = [rev]
       
   383 
       
   384                 self.child_cset = cset
       
   385 
       
   386                 self.commits[rev] = cset
       
   387 
       
   388         try:
       
   389             discover_changed_paths = True
       
   390             strict_node_history = False
       
   391             svn.ra.get_log(self.ra, [self.module], from_revnum, to_revnum, 
       
   392                            0, discover_changed_paths, strict_node_history, rcvr)
       
   393             for args in received:
       
   394                 after_received(*args)
       
   395             self.last_revnum = to_revnum
       
   396         except SubversionException, (_, num):
       
   397             if num == svn.core.SVN_ERR_FS_NO_SUCH_REVISION:
       
   398                 raise NoSuchRevision(branch=self, 
       
   399                     revision="Revision number %d" % to_revnum)
       
   400             raise
       
   401 
       
   402     def getheads(self):
       
   403         # svn-url@rev
       
   404         # Not safe if someone committed:
       
   405         self.heads = [self.head]
       
   406         # print self.commits.keys()
       
   407         return self.heads
       
   408 
       
   409     def _getfile(self, file, rev):
       
   410         io = StringIO()
       
   411         # TODO: ra.get_file transmits the whole file instead of diffs.
       
   412         mode = ''
       
   413         try:
       
   414             revnum = int(rev.split("@")[-1])
       
   415             if self.module != self.modulemap[revnum]:
       
   416                 self.module = self.modulemap[revnum]
       
   417                 self.reparent(self.module)
       
   418             info = svn.ra.get_file(self.ra, file, revnum, io)
       
   419             if isinstance(info, list):
       
   420                 info = info[-1]
       
   421             mode = ("svn:executable" in info) and 'x' or ''
       
   422             mode = ("svn:special" in info) and 'l' or mode
       
   423         except SubversionException, e:
       
   424             notfound = (svn.core.SVN_ERR_FS_NOT_FOUND,
       
   425                 svn.core.SVN_ERR_RA_DAV_PATH_NOT_FOUND)
       
   426             if e.apr_err in notfound: # File not found
       
   427                 raise IOError()
       
   428             raise
       
   429         data = io.getvalue()
       
   430         if mode == 'l':
       
   431             link_prefix = "link "
       
   432             if data.startswith(link_prefix):
       
   433                 data = data[len(link_prefix):]
       
   434         return data, mode
       
   435 
       
   436     def getfile(self, file, rev):
       
   437         data, mode = self._getfile(file, rev)
       
   438         self.modecache[(file, rev)] = mode
       
   439         return data
       
   440 
       
   441     def getmode(self, file, rev):        
       
   442         return self.modecache[(file, rev)]
       
   443 
       
   444     def getchanges(self, rev):
       
   445         self.modecache = {}
       
   446         files = self.files[rev]
       
   447         cl = files
       
   448         cl.sort()
       
   449         return cl
       
   450 
       
   451     def getcommit(self, rev):
       
   452         return self.commits[rev]
       
   453 
       
   454     def gettags(self):
       
   455         return []
       
   456 
       
   457     def _find_children(self, path, revnum):
       
   458         path = path.strip("/")
       
   459 
       
   460         def _find_children_fallback(path, revnum):
       
   461             # SWIG python bindings for getdir are broken up to at least 1.4.3
       
   462             if not hasattr(self, 'client_ctx'):
       
   463                 self.client_ctx = svn.client.create_context()
       
   464             optrev = svn.core.svn_opt_revision_t()
       
   465             optrev.kind = svn.core.svn_opt_revision_number
       
   466             optrev.value.number = revnum
       
   467             rpath = '/'.join([self.url, path]).strip('/')
       
   468             return ['%s/%s' % (path, x) for x in svn.client.ls(rpath, optrev, True, self.client_ctx).keys()]
       
   469 
       
   470         if hasattr(self, '_find_children_fallback'):
       
   471             return _find_children_fallback(path, revnum)
       
   472 
       
   473         self.reparent("/" + path)
       
   474         pool = Pool()
       
   475 
       
   476         children = []
       
   477         def find_children_inner(children, path, revnum = revnum):
       
   478             if hasattr(svn.ra, 'get_dir2'): # Since SVN 1.4
       
   479                 fields = 0xffffffff # Binding does not provide SVN_DIRENT_ALL
       
   480                 getdir = svn.ra.get_dir2(self.ra, path, revnum, fields, pool)
       
   481             else:
       
   482                 getdir = svn.ra.get_dir(self.ra, path, revnum, pool)
       
   483             if type(getdir) == dict:
       
   484                 # python binding for getdir is broken up to at least 1.4.3
       
   485                 raise CompatibilityException()
       
   486             dirents = getdir[0]
       
   487             if type(dirents) == int:
       
   488                 # got here once due to infinite recursion bug
       
   489                 # pprint.pprint(getdir)
       
   490                 return
       
   491             c = dirents.keys()
       
   492             c.sort()
       
   493             for child in c:
       
   494                 dirent = dirents[child]
       
   495                 if dirent.kind == svn.core.svn_node_dir:
       
   496                     find_children_inner(children, (path + "/" + child).strip("/"))
       
   497                 else:
       
   498                     children.append((path + "/" + child).strip("/"))
       
   499 
       
   500         try:
       
   501             find_children_inner(children, "")
       
   502         except CompatibilityException:
       
   503             self._find_children_fallback = True
       
   504             self.reparent(self.module)
       
   505             return _find_children_fallback(path, revnum)
       
   506 
       
   507         self.reparent(self.module)
       
   508         return [path + "/" + c for c in children]
       
   509 
       
   510     def recode(self, s):
       
   511         return recode(self.encoding, s)