# vim:fileencoding=utf-8:noet
from __future__ import (unicode_literals, division, absolute_import, print_function)

from string import hexdigits

from powerline.lint.markedjson.error import MarkedError
from powerline.lint.markedjson import tokens
from powerline.lib.unicode import unicode, unichr, surrogate_pair_to_character


hexdigits_set = set(hexdigits)


# Scanner produces tokens of the following types:
# STREAM-START
# STREAM-END
# DOCUMENT-START
# DOCUMENT-END
# FLOW-SEQUENCE-START
# FLOW-MAPPING-START
# FLOW-SEQUENCE-END
# FLOW-MAPPING-END
# FLOW-ENTRY
# KEY
# VALUE
# SCALAR(value, plain, style)
#
# Read comments in the Scanner code for more details.
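#
# For illustration (not normative; derived from the fetchers below), the
# document '{"a": [1, true]}' produces roughly this token sequence:
# 	STREAM-START
# 	FLOW-MAPPING-START
# 	KEY
# 	SCALAR('a', plain=False)
# 	VALUE
# 	FLOW-SEQUENCE-START
# 	SCALAR('1', plain=True)
# 	FLOW-ENTRY
# 	SCALAR('true', plain=True)
# 	FLOW-SEQUENCE-END
# 	FLOW-MAPPING-END
# 	STREAM-END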


class ScannerError(MarkedError):
	pass


class SimpleKey:
	# See the simple keys treatment below.
	def __init__(self, token_number, index, line, column, mark):
		self.token_number = token_number
		self.index = index
		self.line = line
		self.column = column
		self.mark = mark


class Scanner:
	def __init__(self):
		'''Initialize the scanner.'''
		# It is assumed that Scanner and Reader will have a common descendant.
		# Reader does the dirty work of checking for BOM and converting the
		# input data to Unicode. It also adds NUL to the end.
		#
		# Reader supports the following methods:
		# 	self.peek(i=0)		 # peek the i-th character past the pointer
		# 	self.prefix(l=1)	 # peek the next l characters
		# 	self.forward(l=1)	 # read the next l characters and move the pointer
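		#
		# A hedged example of those semantics (assuming the pointer sits at
		# the start of the input '{"a": 1}'):
		# 	self.peek()      # => '{' (the pointer does not move)
		# 	self.peek(1)     # => '"'
		# 	self.prefix(2)   # => '{"'
		# 	self.forward(2)  # the pointer now points at 'a'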

		# Have we reached the end of the stream?
		self.done = False

		# The number of unclosed '{' and '['. `flow_level == 0` means block
		# context.
		self.flow_level = 0

		# List of processed tokens that are not yet emitted.
		self.tokens = []

		# Add the STREAM-START token.
		self.fetch_stream_start()

		# Number of tokens that were emitted through the `get_token` method.
		self.tokens_taken = 0

		# Variables related to simple keys treatment.

		# A simple key is a key that is not denoted by the '?' indicator.
		# We emit the KEY token before all keys, so when we find a potential
		# simple key, we try to locate the corresponding ':' indicator.
		# Simple keys should be limited to a single line.

		# Can a simple key start at the current position? A simple key may
		# start after '{', '[' or ',' (in the flow context).
		self.allow_simple_key = False

		# Keep track of possible simple keys. This is a dictionary. The key
		# is `flow_level`; there can be no more than one possible simple key
		# for each level. The value is a SimpleKey record:
		# 	(token_number, index, line, column, mark)
		# A simple key may start with SCALAR(flow), '[', or '{' tokens.
		self.possible_simple_keys = {}
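
		# For example (illustrative values only): after scanning the '"a"' in
		# '{"a": 1}', this dictionary could hold something like
		# 	{1: SimpleKey(token_number=2, index=1, line=0, column=1, mark)}
		# meaning that a simple key may have started at flow level 1, at the
		# position where that scalar token (token number 2) was produced.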

	# Public methods.

	def check_token(self, *choices):
		# Check if the next token is one of the given types.
		while self.need_more_tokens():
			self.fetch_more_tokens()
		if self.tokens:
			if not choices:
				return True
			for choice in choices:
				if isinstance(self.tokens[0], choice):
					return True
		return False

	def peek_token(self):
		# Return the next token, but do not remove it from the queue.
		while self.need_more_tokens():
			self.fetch_more_tokens()
		if self.tokens:
			return self.tokens[0]

	def get_token(self):
		# Return the next token.
		while self.need_more_tokens():
			self.fetch_more_tokens()
		if self.tokens:
			self.tokens_taken += 1
			return self.tokens.pop(0)
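
	# A minimal, hedged sketch of how a consumer typically drives these three
	# methods (it assumes some concrete subclass mixing in a Reader, which
	# this module does not define):
	#
	# 	while not scanner.check_token(tokens.StreamEndToken):
	# 		token = scanner.get_token()
	# 		...  # dispatch on type(token)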

	# Private methods.

	def need_more_tokens(self):
		if self.done:
			return False
		if not self.tokens:
			return True
		# The current token may be a potential simple key, so we
		# need to look further.
		self.stale_possible_simple_keys()
		if self.next_possible_simple_key() == self.tokens_taken:
			return True

	def fetch_more_tokens(self):

		# Eat whitespace until we reach the next token.
		self.scan_to_next_token()

		# Remove obsolete possible simple keys.
		self.stale_possible_simple_keys()

		# Peek the next character.
		ch = self.peek()

		# Is it the end of stream?
		if ch == '\0':
			return self.fetch_stream_end()

		# Note: the order of the following checks is NOT significant.

		# Is it the flow sequence start indicator?
		if ch == '[':
			return self.fetch_flow_sequence_start()

		# Is it the flow mapping start indicator?
		if ch == '{':
			return self.fetch_flow_mapping_start()

		# Is it the flow sequence end indicator?
		if ch == ']':
			return self.fetch_flow_sequence_end()

		# Is it the flow mapping end indicator?
		if ch == '}':
			return self.fetch_flow_mapping_end()

		# Is it the flow entry indicator?
		if ch == ',':
			return self.fetch_flow_entry()

		# Is it the value indicator?
		if ch == ':' and self.flow_level:
			return self.fetch_value()

		# Is it a double quoted scalar?
		if ch == '"':
			return self.fetch_double()

		# It must be a plain scalar then.
		if self.check_plain():
			return self.fetch_plain()

		# No? It’s an error. Let’s produce a nice error message.
		raise ScannerError(
			'while scanning for the next token', None,
			'found character %r that cannot start any token' % ch,
			self.get_mark()
		)

	# Simple keys treatment.

	def next_possible_simple_key(self):
		# Return the number of the nearest possible simple key. Actually we
		# don’t need to loop through the whole dictionary. We may replace it
		# with the following code:
		# 	if not self.possible_simple_keys:
		# 		return None
		# 	return self.possible_simple_keys[
		# 			min(self.possible_simple_keys.keys())].token_number
		min_token_number = None
		for level in self.possible_simple_keys:
			key = self.possible_simple_keys[level]
			if min_token_number is None or key.token_number < min_token_number:
				min_token_number = key.token_number
		return min_token_number

	def stale_possible_simple_keys(self):
		# Remove entries that are no longer possible simple keys. According to
		# the YAML specification, simple keys should be limited to a single
		# line. Disabling this procedure would allow simple keys of any length
		# and height (though that may cause problems if indentation is
		# broken).
		for level in list(self.possible_simple_keys):
			key = self.possible_simple_keys[level]
			if key.line != self.line:
				del self.possible_simple_keys[level]

	def save_possible_simple_key(self):
		# The next token may start a simple key. We check if it’s possible
		# and save its position. This function is called for
		# 	SCALAR(flow), '[', and '{'.

		# The next token might be a simple key. Let’s save its number and
		# position.
		if self.allow_simple_key:
			self.remove_possible_simple_key()
			token_number = self.tokens_taken + len(self.tokens)
			key = SimpleKey(token_number, self.index, self.line, self.column, self.get_mark())
			self.possible_simple_keys[self.flow_level] = key

	def remove_possible_simple_key(self):
		# Remove the saved possible key position at the current flow level.
		if self.flow_level in self.possible_simple_keys:
			del self.possible_simple_keys[self.flow_level]

	# Fetchers.

	def fetch_stream_start(self):
		# We always add STREAM-START as the first token and STREAM-END as the
		# last token.

		# Read the token.
		mark = self.get_mark()

		# Add STREAM-START.
		self.tokens.append(tokens.StreamStartToken(mark, mark, encoding=self.encoding))

	def fetch_stream_end(self):
		# Reset simple keys.
		self.remove_possible_simple_key()
		self.allow_simple_key = False
		self.possible_simple_keys = {}

		# Read the token.
		mark = self.get_mark()

		# Add STREAM-END.
		self.tokens.append(tokens.StreamEndToken(mark, mark))

		# The stream is finished.
		self.done = True

	def fetch_flow_sequence_start(self):
		self.fetch_flow_collection_start(tokens.FlowSequenceStartToken)

	def fetch_flow_mapping_start(self):
		self.fetch_flow_collection_start(tokens.FlowMappingStartToken)

	def fetch_flow_collection_start(self, TokenClass):
		# '[' and '{' may start a simple key.
		self.save_possible_simple_key()

		# Increase the flow level.
		self.flow_level += 1

		# Simple keys are allowed after '[' and '{'.
		self.allow_simple_key = True

		# Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
		start_mark = self.get_mark()
		self.forward()
		end_mark = self.get_mark()
		self.tokens.append(TokenClass(start_mark, end_mark))

	def fetch_flow_sequence_end(self):
		self.fetch_flow_collection_end(tokens.FlowSequenceEndToken)

	def fetch_flow_mapping_end(self):
		self.fetch_flow_collection_end(tokens.FlowMappingEndToken)

	def fetch_flow_collection_end(self, TokenClass):
		# Reset possible simple key on the current level.
		self.remove_possible_simple_key()

		# Decrease the flow level.
		self.flow_level -= 1

		# No simple keys after ']' or '}'.
		self.allow_simple_key = False

		# Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
		start_mark = self.get_mark()
		self.forward()
		end_mark = self.get_mark()
		self.tokens.append(TokenClass(start_mark, end_mark))

	def fetch_value(self):
		# Is there a simple key waiting at the current flow level?
		if self.flow_level in self.possible_simple_keys:

			# Add KEY.
			key = self.possible_simple_keys[self.flow_level]
			del self.possible_simple_keys[self.flow_level]
			self.tokens.insert(key.token_number - self.tokens_taken, tokens.KeyToken(key.mark, key.mark))

			# There cannot be two simple keys one after another.
			self.allow_simple_key = False

		# Add VALUE.
		start_mark = self.get_mark()
		self.forward()
		end_mark = self.get_mark()
		self.tokens.append(tokens.ValueToken(start_mark, end_mark))
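
		# Illustration (hedged, the values are hypothetical): while scanning
		# '{"a": 1}', at the moment ':' is reached self.tokens may hold
		# 	[FlowMappingStartToken, ScalarToken('a')]
		# with STREAM-START already taken and the saved simple key pointing at
		# token number 2. The insert() above then yields
		# 	[FlowMappingStartToken, KeyToken, ScalarToken('a')]
		# before VALUE is appended.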

	def fetch_flow_entry(self):
		# Simple keys are allowed after ','.
		self.allow_simple_key = True

		# Reset possible simple key on the current level.
		self.remove_possible_simple_key()

		# Add FLOW-ENTRY.
		start_mark = self.get_mark()
		self.forward()
		end_mark = self.get_mark()
		self.tokens.append(tokens.FlowEntryToken(start_mark, end_mark))

	def fetch_double(self):
		# A flow scalar could be a simple key.
		self.save_possible_simple_key()

		# No simple keys after flow scalars.
		self.allow_simple_key = False

		# Scan and add SCALAR.
		self.tokens.append(self.scan_flow_scalar())

	def fetch_plain(self):

		self.save_possible_simple_key()

		# No simple keys after plain scalars.
		self.allow_simple_key = False

		# Scan and add SCALAR. May change `allow_simple_key`.
		self.tokens.append(self.scan_plain())

	# Checkers.

	def check_plain(self):
		# In this JSON subset a plain (unquoted) scalar can only be a number
		# or one of the literals 'null', 'true' and 'false', so it must start
		# with a digit, '-', 'n', 't' or 'f'.
		return self.peek() in '0123456789-ntf'

	# Scanners.

	def scan_to_next_token(self):
		# Skip the spaces, tabs and line breaks separating tokens.
		while self.peek() in ' \t\n':
			self.forward()

	def scan_flow_scalar(self):
		# See the specification for details.
		# Note that we relax the indentation rules for quoted scalars: quoted
		# scalars don’t need to adhere to indentation, because '"' clearly
		# marks where they begin and end. Therefore we are less restrictive
		# than the specification requires. We only need to check that document
		# separators are not included in scalars.
		chunks = []
		start_mark = self.get_mark()
		quote = self.peek()
		self.forward()
		chunks.extend(self.scan_flow_scalar_non_spaces(start_mark))
		while self.peek() != quote:
			chunks.extend(self.scan_flow_scalar_spaces(start_mark))
			chunks.extend(self.scan_flow_scalar_non_spaces(start_mark))
		self.forward()
		end_mark = self.get_mark()
		return tokens.ScalarToken(unicode().join(chunks), False, start_mark, end_mark, '"')
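
	# For example (illustrative): given the input '" a b"', the loop above
	# alternates between the two helpers and collects the chunks
	# 	[' ', 'a', ' ', 'b']
	# which join into the scalar value ' a b' with style '"'.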

	ESCAPE_REPLACEMENTS = {
		'b': '\x08',
		't': '\x09',
		'n': '\x0A',
		'f': '\x0C',
		'r': '\x0D',
		'"': '\"',
		'\\': '\\',
	}

	ESCAPE_CODES = {
		'u': 4,
	}
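
	# Hedged examples of the escape handling implemented below:
	# 	'\n'            -> '\x0A'   (via ESCAPE_REPLACEMENTS)
	# 	'\u0041'        -> 'A'      (via ESCAPE_CODES: 4 hex digits)
	# 	'\uD83D\uDE00'  -> U+1F600  (a surrogate pair, recombined by
	# 	                            surrogate_pair_to_character())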

	def scan_flow_scalar_non_spaces(self, start_mark):
		# See the specification for details.
		chunks = []
		while True:
			length = 0
			while self.peek(length) not in '\"\\\0 \t\n':
				length += 1
			if length:
				chunks.append(self.prefix(length))
				self.forward(length)
			ch = self.peek()
			if ch == '\\':
				self.forward()
				ch = self.peek()
				if ch in self.ESCAPE_REPLACEMENTS:
					chunks.append(self.ESCAPE_REPLACEMENTS[ch])
					self.forward()
				elif ch in self.ESCAPE_CODES:
					length = self.ESCAPE_CODES[ch]
					self.forward()
					for k in range(length):
						if self.peek(k) not in hexdigits:
							raise ScannerError(
								'while scanning a double-quoted scalar', start_mark,
								'expected escape sequence of %d hexadecimal digits, but found %r' % (
									length, self.peek(k)),
								self.get_mark()
							)
					code = int(self.prefix(length), 16)
					self.forward(length)
					if 0xD800 <= code <= 0xDBFF:
						# Start of a surrogate pair: the low surrogate must follow.
						next_char = self.prefix(6)
						if (
							next_char[0] != '\\'
							or next_char[1] != 'u'
							or not (set(next_char[2:]) < hexdigits_set)
							or not (0xDC00 <= int(next_char[2:], 16) <= 0xDFFF)
						):
							raise ScannerError(
								'while scanning a double-quoted scalar', start_mark,
								'expected an escape sequence with the low surrogate of the pair, but found %r' % (
									next_char
								),
								self.get_mark()
							)
						code = surrogate_pair_to_character(code, int(next_char[2:], 16))
						self.forward(6)
					chunks.append(unichr(code))
				else:
					raise ScannerError(
						'while scanning a double-quoted scalar', start_mark,
						('found unknown escape character %r' % ch), self.get_mark()
					)
			else:
				return chunks

	def scan_flow_scalar_spaces(self, start_mark):
		# See the specification for details.
		chunks = []
		length = 0
		while self.peek(length) in ' \t':
			length += 1
		whitespaces = self.prefix(length)
		self.forward(length)
		ch = self.peek()
		if ch == '\0':
			raise ScannerError(
				'while scanning a quoted scalar', start_mark,
				'found unexpected end of stream', self.get_mark()
			)
		elif ch == '\n':
			raise ScannerError(
				'while scanning a quoted scalar', start_mark,
				'found unexpected line end', self.get_mark()
			)
		else:
			chunks.append(whitespaces)
		return chunks

	def scan_plain(self):
		chunks = []
		start_mark = self.get_mark()
		spaces = []
		while True:
			length = 0
			while True:
				# Characters that can occur in a number or in the literals
				# 'null', 'true' and 'false'.
				if self.peek(length) not in 'eE.0123456789nul-tr+fas':
					break
				length += 1
			if length == 0:
				break
			self.allow_simple_key = False
			chunks.extend(spaces)
			chunks.append(self.prefix(length))
			self.forward(length)
		end_mark = self.get_mark()
		return tokens.ScalarToken(''.join(chunks), True, start_mark, end_mark)
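
	# For example (illustrative): scanning 'true, 1' as a plain scalar stops
	# at the ',' (which is not in the character set above) and yields
	# SCALAR('true', plain=True); the ',' is then fetched as FLOW-ENTRY.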