path: root/powerline/lib/unicode.py
# vim:fileencoding=utf-8:noet
from __future__ import (unicode_literals, division, absolute_import, print_function)

import sys
import codecs

from unicodedata import east_asian_width, combining

from powerline.lib.encoding import get_preferred_output_encoding


try:
	from __builtin__ import unicode
except ImportError:
	unicode = str


try:
	from __builtin__ import unichr
except ImportError:
	unichr = chr


if sys.maxunicode < 0x10FFFF:
	_unichr = unichr

	def unichr(ch):
		if ch <= sys.maxunicode:
			return _unichr(ch)
		else:
			ch -= 0x10000
			return _unichr((ch >> 10) + 0xD800) + _unichr((ch & ((1 << 10) - 1)) + 0xDC00)
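
# Illustrative example (not part of the original module): on a narrow (UCS-2)
# Python build the wrapper above splits astral codepoints into surrogate pairs,
# e.g. unichr(0x1F600) == u'\ud83d\ude00', while unichr(0x20AC) == u'\u20ac'
# is returned unchanged because it fits within sys.maxunicode.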


def u(s):
	'''Return unicode instance assuming UTF-8 encoded string.
	'''
	if type(s) is unicode:
		return s
	else:
		return unicode(s, 'utf-8')
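
# Illustrative example (assumes UTF-8 encoded bytes input): u(b'\xc3\xa9')
# returns u'\xe9' ('é'), while an argument that is already unicode is returned
# unchanged.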


if sys.version_info < (3,):
	def tointiter(s):
		'''Convert a byte string to a sequence of integers
		'''
		return (ord(c) for c in s)
else:
	def tointiter(s):
		'''Convert a byte string to a sequence of integers
		'''
		return iter(s)
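
# Illustrative example: list(tointiter(b'AB')) == [65, 66] on both Python 2
# (via ord()) and Python 3 (where iterating over bytes already yields integers).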


def powerline_decode_error(e):
	if not isinstance(e, UnicodeDecodeError):
		raise NotImplementedError
	return (''.join((
		'<{0:02X}>'.format(c)
		for c in tointiter(e.object[e.start:e.end])
	)), e.end)


codecs.register_error('powerline_decode_error', powerline_decode_error)
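
# Illustrative example: with the handler registered above, undecodable bytes
# are replaced by their hex codes instead of raising, e.g. (assuming UTF-8)
# b'\xffabc'.decode('utf-8', 'powerline_decode_error') == '<FF>abc'.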


last_swe_idx = 0


def register_strwidth_error(strwidth):
	'''Create a new encode error handling method similar to ``replace``

	Like ``replace`` this method uses question marks in place of the characters 
	that cannot be represented in the requested encoding. Unlike ``replace`` the 
	number of question marks is identical to the number of display cells the 
	offending character occupies. Thus encoding ``…`` (U+2026, HORIZONTAL 
	ELLIPSIS) to ``latin1`` will emit one question mark, but encoding ``Ａ`` 
	(U+FF21, FULLWIDTH LATIN CAPITAL LETTER A) will emit two question marks.

	Since the width of some characters depends on the terminal settings, which 
	powerline knows how to respect, a single error handling method cannot be 
	used. Instead, this generator function takes a ``strwidth`` function (one 
	that knows how to compute string width respecting all needed settings) and 
	returns the name of a newly registered error handling method.

	:param function strwidth:
		Function that computes the width of a string, measured in the display 
		cells the string occupies when displayed.

	:return: New error handling method name.
	'''
	global last_swe_idx
	last_swe_idx += 1

	def powerline_encode_strwidth_error(e):
		if not isinstance(e, UnicodeEncodeError):
			raise NotImplementedError
		return ('?' * strwidth(e.object[e.start:e.end]), e.end)

	ename = 'powerline_encode_strwidth_error_{0}'.format(last_swe_idx)
	codecs.register_error(ename, powerline_encode_strwidth_error)
	return ename
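
# Illustrative usage sketch (the simplistic strwidth lambda below is an
# assumption for demonstration, not powerline's real width function): each
# unencodable character becomes as many question marks as display cells it
# would occupy.
#
#     ename = register_strwidth_error(
#         lambda s: sum(2 if east_asian_width(c) in 'FW' else 1 for c in s))
#     'Ａ'.encode('latin1', ename)   # b'??' (U+FF21 occupies two cells)
#     '…'.encode('latin1', ename)    # b'?'  (U+2026 occupies one cell)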


def out_u(s):
	'''Return unicode string suitable for displaying

	Unlike other functions this one assumes the get_preferred_output_encoding() 
	encoding. Unlike u() it does not throw exceptions for invalid unicode 
	strings. Unlike safe_unicode() it does throw an exception if the object is 
	not a string.
	'''
	if isinstance(s, unicode):
		return s
	elif isinstance(s, bytes):
		return unicode(s, get_preferred_output_encoding(), 'powerline_decode_error')
	else:
		raise TypeError('Expected unicode or bytes instance, got {0}'.format(repr(type(s))))
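
# Illustrative example: out_u(b'abc') == u'abc'; bytes that are invalid in the
# preferred output encoding are rendered as hex placeholders (e.g. '<FF>')
# rather than raising, and non-string objects raise TypeError.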


def safe_unicode(s):
	'''Return unicode instance without raising an exception.

	Order of assumptions:
	* ASCII string or unicode object
	* UTF-8 string
	* Object with a __str__() or __repr__() method that returns a UTF-8 string 
	  or unicode object (depending on the Python version)
	* String in powerline.lib.encoding.get_preferred_output_encoding() encoding
	* If everything above failed, apply safe_unicode() to the last exception 
	  raised
	'''
	try:
		try:
			if type(s) is bytes:
				return unicode(s, 'ascii')
			else:
				return unicode(s)
		except UnicodeDecodeError:
			try:
				return unicode(s, 'utf-8')
			except TypeError:
				return unicode(str(s), 'utf-8')
			except UnicodeDecodeError:
				return unicode(s, get_preferred_output_encoding())
	except Exception as e:
		return safe_unicode(e)
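
# Illustrative examples: safe_unicode(b'abc') == u'abc' (ASCII assumption),
# safe_unicode(b'\xc3\xa9') == u'\xe9' (UTF-8 fallback), and an object whose
# conversion fails yields the unicode form of the exception that was raised.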


class FailedUnicode(unicode):
	'''Builtin ``unicode`` subclass indicating a fatal error

	If your code for some reason needs to determine whether the `.render()` 
	method failed it should check whether the returned string is a FailedUnicode 
	instance. Alternatively you could subclass Powerline and override the 
	`.render()` method to do what you like in place of catching the exception 
	and returning FailedUnicode.
	'''
	pass


if sys.version_info < (3,):
	def string(s):
		if type(s) is not str:
			return s.encode('utf-8')
		else:
			return s
else:
	def string(s):
		if type(s) is not str:
			return s.decode('utf-8')
		else:
			return s


string.__doc__ = (
	'''Transform a ``unicode`` or ``bytes`` object into a ``str`` object

	On Python-2 this encodes ``unicode`` to ``bytes`` (which is ``str``) using 
	the UTF-8 encoding; on Python-3 this decodes ``bytes`` to ``unicode`` (which 
	is ``str``) using the UTF-8 encoding.

	Useful for functions that expect a ``str`` object on both Python versions, 
	not caring about the semantic differences between ``str`` objects in 
	Python-2 and Python-3.
	'''
)
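
# Illustrative example: string(u'abc') == string(b'abc') == 'abc' on both
# Python versions; an argument that is already ``str`` passes through as-is.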


def surrogate_pair_to_character(high, low):
	'''Transform a pair of surrogate codepoints to one codepoint
	'''
	return 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00)
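
# Illustrative example: surrogate_pair_to_character(0xD83D, 0xDE00) == 0x1F600
# (U+1F600 GRINNING FACE), the inverse of the narrow-build unichr() above.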


_strwidth_documentation = (
	'''Compute string width in display cells

	{0}

	:param dict width_data:
		Dictionary which maps east_asian_width property values to character 
		widths. It is expected to contain the following keys and values (from 
		the `East Asian Width annex <http://www.unicode.org/reports/tr11/>`_):

		===  ======  ===========================================================
		Key  Value   Description
		===  ======  ===========================================================
		F    2       Fullwidth: all characters that are defined as Fullwidth in 
		             the Unicode Standard [Unicode] by having a compatibility 
		             decomposition of type <wide> to characters elsewhere in the 
		             Unicode Standard that are implicitly narrow but unmarked.
		H    1       Halfwidth: all characters that are explicitly defined as 
		             Halfwidth in the Unicode Standard by having a compatibility 
		             decomposition of type <narrow> to characters elsewhere in 
		             the Unicode Standard that are implicitly wide but unmarked, 
		             plus U+20A9 ₩ WON SIGN.
		W    2       Wide: all other characters that are always wide. These 
		             characters occur only in the context of East Asian 
		             typography where they are wide characters (such as the 
		             Unified Han Ideographs or Squared Katakana Symbols). This 
		             category includes characters that have explicit halfwidth 
		             counterparts.
		Na   1       Narrow: characters that are always narrow and have explicit 
		             fullwidth or wide counterparts. These characters are 
		             implicitly narrow in East Asian typography and legacy 
		             character sets because they have explicit fullwidth or wide 
		             counterparts. All of ASCII is an example of East Asian 
		             Narrow characters.
		A    1 or 2  Ambiguous: characters that may sometimes be wide and 
		             sometimes narrow. Ambiguous characters require additional 
		             information not contained in the character code to further 
		             resolve their width. This information is usually defined in 
		             the terminal settings, which should in turn respect the 
		             glyph widths of the fonts in use. Also see the 
		             :ref:`ambiwidth configuration option 
		             <config-common-ambiwidth>`.
		N    1       Neutral: characters that do not occur in legacy East Asian 
		             character sets.
		===  ======  ===========================================================

	:param unicode string:
		String whose width will be calculated.

	:return: unsigned integer.''')


def strwidth_ucs_4(width_data, string):
	return sum(((
		(
			0
		) if combining(symbol) else (
			width_data[east_asian_width(symbol)]
		)
	) for symbol in string))


strwidth_ucs_4.__doc__ = _strwidth_documentation.format(
	'''This version of the function expects characters above 0xFFFF to be 
	represented using a single symbol. This is only the case in UCS-4 Python 
	builds.

	.. note::
		Even in UCS-4 Python builds it is possible to represent characters above 
		0xFFFF using surrogate pairs. Characters represented this way are not 
		supported.''')
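
# Illustrative example (the width_data values are an assumption following the
# table above): with width_data = {'F': 2, 'H': 1, 'W': 2, 'Na': 1, 'A': 1,
# 'N': 1}, strwidth_ucs_4(width_data, 'Aあ') == 3, and a combining mark such as
# U+0301 contributes 0 cells.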


def strwidth_ucs_2(width_data, string):
	return sum(((
		(
			width_data[east_asian_width(string[i - 1] + symbol)]
		) if 0xDC00 <= ord(symbol) <= 0xDFFF else (
			0
		) if combining(symbol) or 0xD800 <= ord(symbol) <= 0xDBFF else (
			width_data[east_asian_width(symbol)]
		)
	) for i, symbol in enumerate(string)))


strwidth_ucs_2.__doc__ = _strwidth_documentation.format(
	'''This version of the function expects characters above 0xFFFF to be 
	represented using two symbols forming a surrogate pair, which is the only 
	option in UCS-2 Python builds. It still works correctly in UCS-4 Python 
	builds, but is slower than its UCS-4 counterpart.''')
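
# Illustrative example (same assumed width_data as above): for BMP-only input
# strwidth_ucs_2 agrees with strwidth_ucs_4, e.g.
# strwidth_ucs_2(width_data, 'Aあ') == 3; characters above 0xFFFF must be passed
# as surrogate pairs, which the string[i - 1] + symbol lookup above reassembles
# before measuring.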