# HG changeset patch
# User Matt Mackall
# Date 1165184193 21600
# Node ID f96c158ea3a3920b3b45cfd7da8b80b6efe757a1
# Parent 96095d9ff1f82c74304dd4c78d1ae4928dd4034a
Add functions for transcoding and manipulating multibyte strings

diff --git a/mercurial/util.py b/mercurial/util.py
--- a/mercurial/util.py
+++ b/mercurial/util.py
@@ -18,6 +18,65 @@ demandload(globals(), "cStringIO errno g
 demandload(globals(), "os threading time calendar ConfigParser locale")
 
 _encoding = os.environ.get("HGENCODING") or locale.getpreferredencoding()
+_encodingmode = os.environ.get("HGENCODINGMODE", "strict")
+
+def tolocal(s):
+    """
+    Convert a string from internal UTF-8 to local encoding
+
+    All internal strings should be UTF-8 but some repos before the
+    implementation of locale support may contain latin1 or possibly
+    other character sets. We attempt to decode everything strictly
+    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
+    replace unknown characters.
+    """
+    for e in "utf-8 latin1".split():
+        try:
+            u = s.decode(e) # attempt strict decoding
+            return u.encode(_encoding, "replace")
+        except UnicodeDecodeError:
+            pass
+    u = s.decode("utf-8", "replace") # last ditch
+    return u.encode(_encoding, "replace")
+
+def fromlocal(s):
+    """
+    Convert a string from the local character encoding to UTF-8
+
+    We attempt to decode strings using the encoding mode set by
+    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
+    characters will cause an error message. Other modes include
+    'replace', which replaces unknown characters with a special
+    Unicode character, and 'ignore', which drops the character.
+    """
+    try:
+        return s.decode(_encoding, _encodingmode).encode("utf-8")
+    except UnicodeDecodeError, inst:
+        sub = s[max(0, inst.start-10):inst.start+10]
+        raise Abort("decoding near '%s': %s!\n" % (sub, inst))
+
+def locallen(s):
+    """Find the length in characters of a local string"""
+    return len(s.decode(_encoding, "replace"))
+
+def localsub(s, a, b=None):
+    """
+    Return a substring of a local string, counted in characters
+
+    The string is decoded using the local encoding, sliced as
+    u[a:b] (or u[:a] when b is None) and encoded back, so the slice
+    counts characters rather than bytes. Aborts on undecodable input.
+    """
+    try:
+        u = s.decode(_encoding, _encodingmode)
+        if b is not None:
+            u = u[a:b]
+        else:
+            u = u[:a]
+        return u.encode(_encoding, _encodingmode)
+    except UnicodeDecodeError, inst:
+        sub = s[max(0, inst.start-10):inst.start+10]
+        raise Abort("decoding near '%s': %s!\n" % (sub, inst))
 # used by parsedate
 defaultdateformats = ('%Y-%m-%d %H:%M:%S',
                       '%Y-%m-%d %H:%M',