comparison mercurial/util.py @ 3771:f96c158ea3a3

Add functions for transcoding and manipulating multibyte strings
author Matt Mackall <mpm@selenic.com>
date Sun, 03 Dec 2006 16:16:33 -0600
parents 96095d9ff1f8
children 1427949b8f80
comparison
equal deleted inserted replaced
3770:96095d9ff1f8 3771:f96c158ea3a3
16 from demandload import * 16 from demandload import *
17 demandload(globals(), "cStringIO errno getpass popen2 re shutil sys tempfile") 17 demandload(globals(), "cStringIO errno getpass popen2 re shutil sys tempfile")
18 demandload(globals(), "os threading time calendar ConfigParser locale") 18 demandload(globals(), "os threading time calendar ConfigParser locale")
19 19
20 _encoding = os.environ.get("HGENCODING") or locale.getpreferredencoding() 20 _encoding = os.environ.get("HGENCODING") or locale.getpreferredencoding()
21 _encodingmode = os.environ.get("HGENCODINGMODE", "strict")
22
def tolocal(s):
    """
    Convert a string from internal UTF-8 to the local encoding

    Internal strings are expected to be UTF-8, but repositories that
    predate locale support may hold latin1 or other character sets.
    Each candidate codec is tried strictly in turn (UTF-8 first, then
    Latin-1); if none of them decodes the string, it is decoded as
    UTF-8 with unknown characters replaced.

    The result is always encoded to _encoding with the "replace"
    error handler, so characters the local encoding cannot represent
    are substituted instead of raising.
    """
    for codec in ("utf-8", "latin1"):
        try:
            # strict decode: a failure moves on to the next candidate
            return s.decode(codec).encode(_encoding, "replace")
        except UnicodeDecodeError:
            continue
    # last ditch: lossy UTF-8 decode
    return s.decode("utf-8", "replace").encode(_encoding, "replace")
41
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    The string is decoded from _encoding using the error mode held in
    _encodingmode, which is read from the HGENCODINGMODE environment
    variable and defaults to 'strict'. In strict mode an undecodable
    character aborts with an error message. Other modes include
    'replace', which substitutes a special Unicode character, and
    'ignore', which drops the offending character.

    Raises Abort, quoting the bytes around the failure position, when
    decoding fails.
    """
    try:
        decoded = s.decode(_encoding, _encodingmode)
        return decoded.encode("utf-8")
    except UnicodeDecodeError:
        # fetch the active exception through sys so that the handler
        # syntax is accepted by both old and new interpreters
        err = sys.exc_info()[1]
        # show up to 10 bytes on either side of the failure position
        first = max(0, err.start - 10)
        context = s[first:err.start + 10]
        raise Abort("decoding near '%s': %s!\n" % (context, err))
57
def locallen(s):
    """Return the number of characters (not bytes) in a local string"""
    # undecodable bytes are replaced rather than raising, so this
    # never fails on malformed input
    decoded = s.decode(_encoding, "replace")
    return len(decoded)
61
def localsub(s, a, b=None):
    """
    Slice a local string by character position instead of by byte

    s is decoded from _encoding using the error mode in _encodingmode,
    sliced as a unicode string, and encoded back the same way. When b
    is given the result is characters a up to b (u[a:b]); when only a
    is given the result is the first a characters (u[:a]).

    Raises Abort, quoting the bytes around the failure position, when
    s cannot be decoded.
    """
    try:
        u = s.decode(_encoding, _encodingmode)
        if b is not None:
            u = u[a:b]
        else:
            u = u[:a]
        return u.encode(_encoding, _encodingmode)
    except UnicodeDecodeError:
        # fetch the active exception through sys so that the handler
        # syntax is accepted by both old and new interpreters
        inst = sys.exc_info()[1]
        # take a slice of the input around the failure; the previous
        # code used a comma here, which indexed the string with a tuple
        # and raised TypeError instead of reaching the Abort below
        sub = s[max(0, inst.start - 10):inst.start + 10]
        raise Abort("decoding near '%s': %s!\n" % (sub, inst))
21 73
22 # used by parsedate 74 # used by parsedate
23 defaultdateformats = ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M', 75 defaultdateformats = ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M',
24 '%a %b %d %H:%M:%S %Y') 76 '%a %b %d %H:%M:%S %Y')
25 77