Diffstat (limited to 'lib/ruyaml/scanner.py')
-rw-r--r-- | lib/ruyaml/scanner.py | 2491
1 file changed, 2491 insertions, 0 deletions
diff --git a/lib/ruyaml/scanner.py b/lib/ruyaml/scanner.py new file mode 100644 index 0000000..a5a81dc --- /dev/null +++ b/lib/ruyaml/scanner.py @@ -0,0 +1,2491 @@ +# coding: utf-8 + +# Scanner produces tokens of the following types: +# STREAM-START +# STREAM-END +# DIRECTIVE(name, value) +# DOCUMENT-START +# DOCUMENT-END +# BLOCK-SEQUENCE-START +# BLOCK-MAPPING-START +# BLOCK-END +# FLOW-SEQUENCE-START +# FLOW-MAPPING-START +# FLOW-SEQUENCE-END +# FLOW-MAPPING-END +# BLOCK-ENTRY +# FLOW-ENTRY +# KEY +# VALUE +# ALIAS(value) +# ANCHOR(value) +# TAG(value) +# SCALAR(value, plain, style) +# +# RoundTripScanner +# COMMENT(value) +# +# Read comments in the Scanner code for more details. +# + +import inspect + +from ruyaml.compat import _F, check_anchorname_char, nprint, nprintf # NOQA +from ruyaml.error import CommentMark, MarkedYAMLError # NOQA +from ruyaml.tokens import * # NOQA + +if False: # MYPY + from typing import Any, Dict, List, Optional, Text, Union # NOQA + + from ruyaml.compat import VersionType # NOQA + +__all__ = ['Scanner', 'RoundTripScanner', 'ScannerError'] + + +_THE_END = '\n\0\r\x85\u2028\u2029' +_THE_END_SPACE_TAB = ' \n\0\t\r\x85\u2028\u2029' +_SPACE_TAB = ' \t' + + +def xprintf(*args, **kw): + # type: (Any, Any) -> Any + return nprintf(*args, **kw) + pass + + +class ScannerError(MarkedYAMLError): + pass + + +class SimpleKey: + # See below simple keys treatment. + + def __init__(self, token_number, required, index, line, column, mark): + # type: (Any, Any, int, int, int, Any) -> None + self.token_number = token_number + self.required = required + self.index = index + self.line = line + self.column = column + self.mark = mark + + +class Scanner: + def __init__(self, loader=None): + # type: (Any) -> None + """Initialize the scanner.""" + # It is assumed that Scanner and Reader will have a common descendant. + # Reader do the dirty work of checking for BOM and converting the + # input data to Unicode. It also adds NUL to the end. + # + # Reader supports the following methods + # self.peek(i=0) # peek the next i-th character + # self.prefix(l=1) # peek the next l characters + # self.forward(l=1) # read the next l characters and move the pointer + + self.loader = loader + if self.loader is not None and getattr(self.loader, '_scanner', None) is None: + self.loader._scanner = self + self.reset_scanner() + self.first_time = False + self.yaml_version = None # type: Any + + @property + def flow_level(self): + # type: () -> int + return len(self.flow_context) + + def reset_scanner(self): + # type: () -> None + # Had we reached the end of the stream? + self.done = False + + # flow_context is an expanding/shrinking list consisting of '{' and '[' + # for each unclosed flow context. If empty list that means block context + self.flow_context = [] # type: List[Text] + + # List of processed tokens that are not yet emitted. + self.tokens = [] # type: List[Any] + + # Add the STREAM-START token. + self.fetch_stream_start() + + # Number of tokens that were emitted through the `get_token` method. + self.tokens_taken = 0 + + # The current indentation level. + self.indent = -1 + + # Past indentation levels. + self.indents = [] # type: List[int] + + # Variables related to simple keys treatment. + + # A simple key is a key that is not denoted by the '?' indicator. + # Example of simple keys: + # --- + # block simple key: value + # ? 
not a simple key: + # : { flow simple key: value } + # We emit the KEY token before all keys, so when we find a potential + # simple key, we try to locate the corresponding ':' indicator. + # Simple keys should be limited to a single line and 1024 characters. + + # Can a simple key start at the current position? A simple key may + # start: + # - at the beginning of the line, not counting indentation spaces + # (in block context), + # - after '{', '[', ',' (in the flow context), + # - after '?', ':', '-' (in the block context). + # In the block context, this flag also signifies if a block collection + # may start at the current position. + self.allow_simple_key = True + + # Keep track of possible simple keys. This is a dictionary. The key + # is `flow_level`; there can be no more that one possible simple key + # for each level. The value is a SimpleKey record: + # (token_number, required, index, line, column, mark) + # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow), + # '[', or '{' tokens. + self.possible_simple_keys = {} # type: Dict[Any, Any] + + @property + def reader(self): + # type: () -> Any + try: + return self._scanner_reader # type: ignore + except AttributeError: + if hasattr(self.loader, 'typ'): + self._scanner_reader = self.loader.reader # type: ignore + else: + self._scanner_reader = self.loader._reader # type: ignore + return self._scanner_reader + + @property + def scanner_processing_version(self): # prefix until un-composited + # type: () -> Any + if hasattr(self.loader, 'typ'): + return self.loader.resolver.processing_version # type: ignore + return self.loader.processing_version # type: ignore + + # Public methods. + + def check_token(self, *choices): + # type: (Any) -> bool + # Check if the next token is one of the given types. + while self.need_more_tokens(): + self.fetch_more_tokens() + if len(self.tokens) > 0: + if not choices: + return True + for choice in choices: + if isinstance(self.tokens[0], choice): + return True + return False + + def peek_token(self): + # type: () -> Any + # Return the next token, but do not delete if from the queue. + while self.need_more_tokens(): + self.fetch_more_tokens() + if len(self.tokens) > 0: + return self.tokens[0] + + def get_token(self): + # type: () -> Any + # Return the next token. + while self.need_more_tokens(): + self.fetch_more_tokens() + if len(self.tokens) > 0: + self.tokens_taken += 1 + return self.tokens.pop(0) + + # Private methods. + + def need_more_tokens(self): + # type: () -> bool + if self.done: + return False + if len(self.tokens) == 0: + return True + # The current token may be a potential simple key, so we + # need to look further. + self.stale_possible_simple_keys() + if self.next_possible_simple_key() == self.tokens_taken: + return True + return False + + def fetch_comment(self, comment): + # type: (Any) -> None + raise NotImplementedError + + def fetch_more_tokens(self): + # type: () -> Any + # Eat whitespaces and comments until we reach the next token. + comment = self.scan_to_next_token() + if comment is not None: # never happens for base scanner + return self.fetch_comment(comment) + # Remove obsolete possible simple keys. + self.stale_possible_simple_keys() + + # Compare the current indentation and column. It may add some tokens + # and decrease the current indentation level. + self.unwind_indent(self.reader.column) + + # Peek the next character. + ch = self.reader.peek() + + # Is it the end of stream? + if ch == '\0': + return self.fetch_stream_end() + + # Is it a directive? 
+ if ch == '%' and self.check_directive(): + return self.fetch_directive() + + # Is it the document start? + if ch == '-' and self.check_document_start(): + return self.fetch_document_start() + + # Is it the document end? + if ch == '.' and self.check_document_end(): + return self.fetch_document_end() + + # TODO: support for BOM within a stream. + # if ch == '\uFEFF': + # return self.fetch_bom() <-- issue BOMToken + + # Note: the order of the following checks is NOT significant. + + # Is it the flow sequence start indicator? + if ch == '[': + return self.fetch_flow_sequence_start() + + # Is it the flow mapping start indicator? + if ch == '{': + return self.fetch_flow_mapping_start() + + # Is it the flow sequence end indicator? + if ch == ']': + return self.fetch_flow_sequence_end() + + # Is it the flow mapping end indicator? + if ch == '}': + return self.fetch_flow_mapping_end() + + # Is it the flow entry indicator? + if ch == ',': + return self.fetch_flow_entry() + + # Is it the block entry indicator? + if ch == '-' and self.check_block_entry(): + return self.fetch_block_entry() + + # Is it the key indicator? + if ch == '?' and self.check_key(): + return self.fetch_key() + + # Is it the value indicator? + if ch == ':' and self.check_value(): + return self.fetch_value() + + # Is it an alias? + if ch == '*': + return self.fetch_alias() + + # Is it an anchor? + if ch == '&': + return self.fetch_anchor() + + # Is it a tag? + if ch == '!': + return self.fetch_tag() + + # Is it a literal scalar? + if ch == '|' and not self.flow_level: + return self.fetch_literal() + + # Is it a folded scalar? + if ch == '>' and not self.flow_level: + return self.fetch_folded() + + # Is it a single quoted scalar? + if ch == "'": + return self.fetch_single() + + # Is it a double quoted scalar? + if ch == '"': + return self.fetch_double() + + # It must be a plain scalar then. + if self.check_plain(): + return self.fetch_plain() + + # No? It's an error. Let's produce a nice error message. + raise ScannerError( + 'while scanning for the next token', + None, + _F('found character {ch!r} that cannot start any token', ch=ch), + self.reader.get_mark(), + ) + + # Simple keys treatment. + + def next_possible_simple_key(self): + # type: () -> Any + # Return the number of the nearest possible simple key. Actually we + # don't need to loop through the whole dictionary. We may replace it + # with the following code: + # if not self.possible_simple_keys: + # return None + # return self.possible_simple_keys[ + # min(self.possible_simple_keys.keys())].token_number + min_token_number = None + for level in self.possible_simple_keys: + key = self.possible_simple_keys[level] + if min_token_number is None or key.token_number < min_token_number: + min_token_number = key.token_number + return min_token_number + + def stale_possible_simple_keys(self): + # type: () -> None + # Remove entries that are no longer possible simple keys. According to + # the YAML specification, simple keys + # - should be limited to a single line, + # - should be no longer than 1024 characters. + # Disabling this procedure will allow simple keys of any length and + # height (may cause problems if indentation is broken though). 
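+        # Illustrative sketch (hypothetical input, not from the spec): given
+        #
+        #     first: 1
+        #     second without a colon anywhere on this line ............
+        #
+        # the candidate key recorded at the start of the long line is dropped
+        # by the loop below as soon as the reader leaves that line or moves
+        # more than 1024 characters past the candidate's start; a `required`
+        # candidate raises the "could not find expected ':'" error instead.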
+ for level in list(self.possible_simple_keys): + key = self.possible_simple_keys[level] + if key.line != self.reader.line or self.reader.index - key.index > 1024: + if key.required: + raise ScannerError( + 'while scanning a simple key', + key.mark, + "could not find expected ':'", + self.reader.get_mark(), + ) + del self.possible_simple_keys[level] + + def save_possible_simple_key(self): + # type: () -> None + # The next token may start a simple key. We check if it's possible + # and save its position. This function is called for + # ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'. + + # Check if a simple key is required at the current position. + required = not self.flow_level and self.indent == self.reader.column + + # The next token might be a simple key. Let's save it's number and + # position. + if self.allow_simple_key: + self.remove_possible_simple_key() + token_number = self.tokens_taken + len(self.tokens) + key = SimpleKey( + token_number, + required, + self.reader.index, + self.reader.line, + self.reader.column, + self.reader.get_mark(), + ) + self.possible_simple_keys[self.flow_level] = key + + def remove_possible_simple_key(self): + # type: () -> None + # Remove the saved possible key position at the current flow level. + if self.flow_level in self.possible_simple_keys: + key = self.possible_simple_keys[self.flow_level] + + if key.required: + raise ScannerError( + 'while scanning a simple key', + key.mark, + "could not find expected ':'", + self.reader.get_mark(), + ) + + del self.possible_simple_keys[self.flow_level] + + # Indentation functions. + + def unwind_indent(self, column): + # type: (Any) -> None + # In flow context, tokens should respect indentation. + # Actually the condition should be `self.indent >= column` according to + # the spec. But this condition will prohibit intuitively correct + # constructions such as + # key : { + # } + # #### + # if self.flow_level and self.indent > column: + # raise ScannerError(None, None, + # "invalid intendation or unclosed '[' or '{'", + # self.reader.get_mark()) + + # In the flow context, indentation is ignored. We make the scanner less + # restrictive then specification requires. + if bool(self.flow_level): + return + + # In block context, we may need to issue the BLOCK-END tokens. + while self.indent > column: + mark = self.reader.get_mark() + self.indent = self.indents.pop() + self.tokens.append(BlockEndToken(mark, mark)) + + def add_indent(self, column): + # type: (int) -> bool + # Check if we need to increase indentation. + if self.indent < column: + self.indents.append(self.indent) + self.indent = column + return True + return False + + # Fetchers. + + def fetch_stream_start(self): + # type: () -> None + # We always add STREAM-START as the first token and STREAM-END as the + # last token. + # Read the token. + mark = self.reader.get_mark() + # Add STREAM-START. + self.tokens.append(StreamStartToken(mark, mark, encoding=self.reader.encoding)) + + def fetch_stream_end(self): + # type: () -> None + # Set the current intendation to -1. + self.unwind_indent(-1) + # Reset simple keys. + self.remove_possible_simple_key() + self.allow_simple_key = False + self.possible_simple_keys = {} + # Read the token. + mark = self.reader.get_mark() + # Add STREAM-END. + self.tokens.append(StreamEndToken(mark, mark)) + # The steam is finished. + self.done = True + + def fetch_directive(self): + # type: () -> None + # Set the current intendation to -1. + self.unwind_indent(-1) + + # Reset simple keys. 
+ self.remove_possible_simple_key() + self.allow_simple_key = False + + # Scan and add DIRECTIVE. + self.tokens.append(self.scan_directive()) + + def fetch_document_start(self): + # type: () -> None + self.fetch_document_indicator(DocumentStartToken) + + def fetch_document_end(self): + # type: () -> None + self.fetch_document_indicator(DocumentEndToken) + + def fetch_document_indicator(self, TokenClass): + # type: (Any) -> None + # Set the current intendation to -1. + self.unwind_indent(-1) + + # Reset simple keys. Note that there could not be a block collection + # after '---'. + self.remove_possible_simple_key() + self.allow_simple_key = False + + # Add DOCUMENT-START or DOCUMENT-END. + start_mark = self.reader.get_mark() + self.reader.forward(3) + end_mark = self.reader.get_mark() + self.tokens.append(TokenClass(start_mark, end_mark)) + + def fetch_flow_sequence_start(self): + # type: () -> None + self.fetch_flow_collection_start(FlowSequenceStartToken, to_push='[') + + def fetch_flow_mapping_start(self): + # type: () -> None + self.fetch_flow_collection_start(FlowMappingStartToken, to_push='{') + + def fetch_flow_collection_start(self, TokenClass, to_push): + # type: (Any, Text) -> None + # '[' and '{' may start a simple key. + self.save_possible_simple_key() + # Increase the flow level. + self.flow_context.append(to_push) + # Simple keys are allowed after '[' and '{'. + self.allow_simple_key = True + # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START. + start_mark = self.reader.get_mark() + self.reader.forward() + end_mark = self.reader.get_mark() + self.tokens.append(TokenClass(start_mark, end_mark)) + + def fetch_flow_sequence_end(self): + # type: () -> None + self.fetch_flow_collection_end(FlowSequenceEndToken) + + def fetch_flow_mapping_end(self): + # type: () -> None + self.fetch_flow_collection_end(FlowMappingEndToken) + + def fetch_flow_collection_end(self, TokenClass): + # type: (Any) -> None + # Reset possible simple key on the current level. + self.remove_possible_simple_key() + # Decrease the flow level. + try: + popped = self.flow_context.pop() # NOQA + except IndexError: + # We must not be in a list or object. + # Defer error handling to the parser. + pass + # No simple keys after ']' or '}'. + self.allow_simple_key = False + # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END. + start_mark = self.reader.get_mark() + self.reader.forward() + end_mark = self.reader.get_mark() + self.tokens.append(TokenClass(start_mark, end_mark)) + + def fetch_flow_entry(self): + # type: () -> None + # Simple keys are allowed after ','. + self.allow_simple_key = True + # Reset possible simple key on the current level. + self.remove_possible_simple_key() + # Add FLOW-ENTRY. + start_mark = self.reader.get_mark() + self.reader.forward() + end_mark = self.reader.get_mark() + self.tokens.append(FlowEntryToken(start_mark, end_mark)) + + def fetch_block_entry(self): + # type: () -> None + # Block context needs additional checks. + if not self.flow_level: + # Are we allowed to start a new entry? + if not self.allow_simple_key: + raise ScannerError( + None, + None, + 'sequence entries are not allowed here', + self.reader.get_mark(), + ) + # We may need to add BLOCK-SEQUENCE-START. + if self.add_indent(self.reader.column): + mark = self.reader.get_mark() + self.tokens.append(BlockSequenceStartToken(mark, mark)) + # It's an error for the block entry to occur in the flow context, + # but we let the parser detect this. + else: + pass + # Simple keys are allowed after '-'. 
+ self.allow_simple_key = True + # Reset possible simple key on the current level. + self.remove_possible_simple_key() + + # Add BLOCK-ENTRY. + start_mark = self.reader.get_mark() + self.reader.forward() + end_mark = self.reader.get_mark() + self.tokens.append(BlockEntryToken(start_mark, end_mark)) + + def fetch_key(self): + # type: () -> None + # Block context needs additional checks. + if not self.flow_level: + + # Are we allowed to start a key (not nessesary a simple)? + if not self.allow_simple_key: + raise ScannerError( + None, + None, + 'mapping keys are not allowed here', + self.reader.get_mark(), + ) + + # We may need to add BLOCK-MAPPING-START. + if self.add_indent(self.reader.column): + mark = self.reader.get_mark() + self.tokens.append(BlockMappingStartToken(mark, mark)) + + # Simple keys are allowed after '?' in the block context. + self.allow_simple_key = not self.flow_level + + # Reset possible simple key on the current level. + self.remove_possible_simple_key() + + # Add KEY. + start_mark = self.reader.get_mark() + self.reader.forward() + end_mark = self.reader.get_mark() + self.tokens.append(KeyToken(start_mark, end_mark)) + + def fetch_value(self): + # type: () -> None + # Do we determine a simple key? + if self.flow_level in self.possible_simple_keys: + # Add KEY. + key = self.possible_simple_keys[self.flow_level] + del self.possible_simple_keys[self.flow_level] + self.tokens.insert( + key.token_number - self.tokens_taken, KeyToken(key.mark, key.mark) + ) + + # If this key starts a new block mapping, we need to add + # BLOCK-MAPPING-START. + if not self.flow_level: + if self.add_indent(key.column): + self.tokens.insert( + key.token_number - self.tokens_taken, + BlockMappingStartToken(key.mark, key.mark), + ) + + # There cannot be two simple keys one after another. + self.allow_simple_key = False + + # It must be a part of a complex key. + else: + + # Block context needs additional checks. + # (Do we really need them? They will be caught by the parser + # anyway.) + if not self.flow_level: + + # We are allowed to start a complex value if and only if + # we can start a simple key. + if not self.allow_simple_key: + raise ScannerError( + None, + None, + 'mapping values are not allowed here', + self.reader.get_mark(), + ) + + # If this value starts a new block mapping, we need to add + # BLOCK-MAPPING-START. It will be detected as an error later by + # the parser. + if not self.flow_level: + if self.add_indent(self.reader.column): + mark = self.reader.get_mark() + self.tokens.append(BlockMappingStartToken(mark, mark)) + + # Simple keys are allowed after ':' in the block context. + self.allow_simple_key = not self.flow_level + + # Reset possible simple key on the current level. + self.remove_possible_simple_key() + + # Add VALUE. + start_mark = self.reader.get_mark() + self.reader.forward() + end_mark = self.reader.get_mark() + self.tokens.append(ValueToken(start_mark, end_mark)) + + def fetch_alias(self): + # type: () -> None + # ALIAS could be a simple key. + self.save_possible_simple_key() + # No simple keys after ALIAS. + self.allow_simple_key = False + # Scan and add ALIAS. + self.tokens.append(self.scan_anchor(AliasToken)) + + def fetch_anchor(self): + # type: () -> None + # ANCHOR could start a simple key. + self.save_possible_simple_key() + # No simple keys after ANCHOR. + self.allow_simple_key = False + # Scan and add ANCHOR. + self.tokens.append(self.scan_anchor(AnchorToken)) + + def fetch_tag(self): + # type: () -> None + # TAG could start a simple key. 
+ self.save_possible_simple_key() + # No simple keys after TAG. + self.allow_simple_key = False + # Scan and add TAG. + self.tokens.append(self.scan_tag()) + + def fetch_literal(self): + # type: () -> None + self.fetch_block_scalar(style='|') + + def fetch_folded(self): + # type: () -> None + self.fetch_block_scalar(style='>') + + def fetch_block_scalar(self, style): + # type: (Any) -> None + # A simple key may follow a block scalar. + self.allow_simple_key = True + # Reset possible simple key on the current level. + self.remove_possible_simple_key() + # Scan and add SCALAR. + self.tokens.append(self.scan_block_scalar(style)) + + def fetch_single(self): + # type: () -> None + self.fetch_flow_scalar(style="'") + + def fetch_double(self): + # type: () -> None + self.fetch_flow_scalar(style='"') + + def fetch_flow_scalar(self, style): + # type: (Any) -> None + # A flow scalar could be a simple key. + self.save_possible_simple_key() + # No simple keys after flow scalars. + self.allow_simple_key = False + # Scan and add SCALAR. + self.tokens.append(self.scan_flow_scalar(style)) + + def fetch_plain(self): + # type: () -> None + # A plain scalar could be a simple key. + self.save_possible_simple_key() + # No simple keys after plain scalars. But note that `scan_plain` will + # change this flag if the scan is finished at the beginning of the + # line. + self.allow_simple_key = False + # Scan and add SCALAR. May change `allow_simple_key`. + self.tokens.append(self.scan_plain()) + + # Checkers. + + def check_directive(self): + # type: () -> Any + # DIRECTIVE: ^ '%' ... + # The '%' indicator is already checked. + if self.reader.column == 0: + return True + return None + + def check_document_start(self): + # type: () -> Any + # DOCUMENT-START: ^ '---' (' '|'\n') + if self.reader.column == 0: + if ( + self.reader.prefix(3) == '---' + and self.reader.peek(3) in _THE_END_SPACE_TAB + ): + return True + return None + + def check_document_end(self): + # type: () -> Any + # DOCUMENT-END: ^ '...' (' '|'\n') + if self.reader.column == 0: + if ( + self.reader.prefix(3) == '...' + and self.reader.peek(3) in _THE_END_SPACE_TAB + ): + return True + return None + + def check_block_entry(self): + # type: () -> Any + # BLOCK-ENTRY: '-' (' '|'\n') + return self.reader.peek(1) in _THE_END_SPACE_TAB + + def check_key(self): + # type: () -> Any + # KEY(flow context): '?' + if bool(self.flow_level): + return True + # KEY(block context): '?' (' '|'\n') + return self.reader.peek(1) in _THE_END_SPACE_TAB + + def check_value(self): + # type: () -> Any + # VALUE(flow context): ':' + if self.scanner_processing_version == (1, 1): + if bool(self.flow_level): + return True + else: + if bool(self.flow_level): + if self.flow_context[-1] == '[': + if self.reader.peek(1) not in _THE_END_SPACE_TAB: + return False + elif self.tokens and isinstance(self.tokens[-1], ValueToken): + # mapping flow context scanning a value token + if self.reader.peek(1) not in _THE_END_SPACE_TAB: + return False + return True + # VALUE(block context): ':' (' '|'\n') + return self.reader.peek(1) in _THE_END_SPACE_TAB + + def check_plain(self): + # type: () -> Any + # A plain scalar may start with any non-space character except: + # '-', '?', ':', ',', '[', ']', '{', '}', + # '#', '&', '*', '!', '|', '>', '\'', '\"', + # '%', '@', '`'. + # + # It may also start with + # '-', '?', ':' + # if it is followed by a non-space character. 
+ # + # Note that we limit the last rule to the block context (except the + # '-' character) because we want the flow context to be space + # independent. + srp = self.reader.peek + ch = srp() + if self.scanner_processing_version == (1, 1): + return ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'"%@`' or ( + srp(1) not in _THE_END_SPACE_TAB + and (ch == '-' or (not self.flow_level and ch in '?:')) + ) + # YAML 1.2 + if ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'"%@`': + # ################### ^ ??? + return True + ch1 = srp(1) + if ch == '-' and ch1 not in _THE_END_SPACE_TAB: + return True + if ch == ':' and bool(self.flow_level) and ch1 not in _SPACE_TAB: + return True + + return srp(1) not in _THE_END_SPACE_TAB and ( + ch == '-' or (not self.flow_level and ch in '?:') + ) + + # Scanners. + + def scan_to_next_token(self): + # type: () -> Any + # We ignore spaces, line breaks and comments. + # If we find a line break in the block context, we set the flag + # `allow_simple_key` on. + # The byte order mark is stripped if it's the first character in the + # stream. We do not yet support BOM inside the stream as the + # specification requires. Any such mark will be considered as a part + # of the document. + # + # TODO: We need to make tab handling rules more sane. A good rule is + # Tabs cannot precede tokens + # BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END, + # KEY(block), VALUE(block), BLOCK-ENTRY + # So the checking code is + # if <TAB>: + # self.allow_simple_keys = False + # We also need to add the check for `allow_simple_keys == True` to + # `unwind_indent` before issuing BLOCK-END. + # Scanners for block, flow, and plain scalars need to be modified. + srp = self.reader.peek + srf = self.reader.forward + if self.reader.index == 0 and srp() == '\uFEFF': + srf() + found = False + _the_end = _THE_END + while not found: + while srp() == ' ': + srf() + if srp() == '#': + while srp() not in _the_end: + srf() + if self.scan_line_break(): + if not self.flow_level: + self.allow_simple_key = True + else: + found = True + return None + + def scan_directive(self): + # type: () -> Any + # See the specification for details. + srp = self.reader.peek + srf = self.reader.forward + start_mark = self.reader.get_mark() + srf() + name = self.scan_directive_name(start_mark) + value = None + if name == 'YAML': + value = self.scan_yaml_directive_value(start_mark) + end_mark = self.reader.get_mark() + elif name == 'TAG': + value = self.scan_tag_directive_value(start_mark) + end_mark = self.reader.get_mark() + else: + end_mark = self.reader.get_mark() + while srp() not in _THE_END: + srf() + self.scan_directive_ignored_line(start_mark) + return DirectiveToken(name, value, start_mark, end_mark) + + def scan_directive_name(self, start_mark): + # type: (Any) -> Any + # See the specification for details. 
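+        # Illustrative example (hypothetical input): for the directive line
+        #
+        #     %YAML 1.2
+        #
+        # this scan returns the name 'YAML'; scan_directive() then pairs it
+        # with the value (1, 2) in the resulting DirectiveToken.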
+ length = 0 + srp = self.reader.peek + ch = srp(length) + while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' or ch in '-_:.': + length += 1 + ch = srp(length) + if not length: + raise ScannerError( + 'while scanning a directive', + start_mark, + _F('expected alphabetic or numeric character, but found {ch!r}', ch=ch), + self.reader.get_mark(), + ) + value = self.reader.prefix(length) + self.reader.forward(length) + ch = srp() + if ch not in '\0 \r\n\x85\u2028\u2029': + raise ScannerError( + 'while scanning a directive', + start_mark, + _F('expected alphabetic or numeric character, but found {ch!r}', ch=ch), + self.reader.get_mark(), + ) + return value + + def scan_yaml_directive_value(self, start_mark): + # type: (Any) -> Any + # See the specification for details. + srp = self.reader.peek + srf = self.reader.forward + while srp() == ' ': + srf() + major = self.scan_yaml_directive_number(start_mark) + if srp() != '.': + raise ScannerError( + 'while scanning a directive', + start_mark, + _F("expected a digit or '.', but found {srp_call!r}", srp_call=srp()), + self.reader.get_mark(), + ) + srf() + minor = self.scan_yaml_directive_number(start_mark) + if srp() not in '\0 \r\n\x85\u2028\u2029': + raise ScannerError( + 'while scanning a directive', + start_mark, + _F("expected a digit or '.', but found {srp_call!r}", srp_call=srp()), + self.reader.get_mark(), + ) + self.yaml_version = (major, minor) + return self.yaml_version + + def scan_yaml_directive_number(self, start_mark): + # type: (Any) -> Any + # See the specification for details. + srp = self.reader.peek + srf = self.reader.forward + ch = srp() + if not ('0' <= ch <= '9'): + raise ScannerError( + 'while scanning a directive', + start_mark, + _F('expected a digit, but found {ch!r}', ch=ch), + self.reader.get_mark(), + ) + length = 0 + while '0' <= srp(length) <= '9': + length += 1 + value = int(self.reader.prefix(length)) + srf(length) + return value + + def scan_tag_directive_value(self, start_mark): + # type: (Any) -> Any + # See the specification for details. + srp = self.reader.peek + srf = self.reader.forward + while srp() == ' ': + srf() + handle = self.scan_tag_directive_handle(start_mark) + while srp() == ' ': + srf() + prefix = self.scan_tag_directive_prefix(start_mark) + return (handle, prefix) + + def scan_tag_directive_handle(self, start_mark): + # type: (Any) -> Any + # See the specification for details. + value = self.scan_tag_handle('directive', start_mark) + ch = self.reader.peek() + if ch != ' ': + raise ScannerError( + 'while scanning a directive', + start_mark, + _F("expected ' ', but found {ch!r}", ch=ch), + self.reader.get_mark(), + ) + return value + + def scan_tag_directive_prefix(self, start_mark): + # type: (Any) -> Any + # See the specification for details. + value = self.scan_tag_uri('directive', start_mark) + ch = self.reader.peek() + if ch not in '\0 \r\n\x85\u2028\u2029': + raise ScannerError( + 'while scanning a directive', + start_mark, + _F("expected ' ', but found {ch!r}", ch=ch), + self.reader.get_mark(), + ) + return value + + def scan_directive_ignored_line(self, start_mark): + # type: (Any) -> None + # See the specification for details. 
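+        # Illustrative example (hypothetical input): for the directive line
+        #
+        #     %TAG !e! tag:example.com,2000:app/
+        #
+        # the handle/prefix pair scanned above is
+        # ('!e!', 'tag:example.com,2000:app/'); this helper then only has to
+        # eat trailing spaces, an optional '# comment' and the line break.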
+ srp = self.reader.peek + srf = self.reader.forward + while srp() == ' ': + srf() + if srp() == '#': + while srp() not in _THE_END: + srf() + ch = srp() + if ch not in _THE_END: + raise ScannerError( + 'while scanning a directive', + start_mark, + _F('expected a comment or a line break, but found {ch!r}', ch=ch), + self.reader.get_mark(), + ) + self.scan_line_break() + + def scan_anchor(self, TokenClass): + # type: (Any) -> Any + # The specification does not restrict characters for anchors and + # aliases. This may lead to problems, for instance, the document: + # [ *alias, value ] + # can be interpteted in two ways, as + # [ "value" ] + # and + # [ *alias , "value" ] + # Therefore we restrict aliases to numbers and ASCII letters. + srp = self.reader.peek + start_mark = self.reader.get_mark() + indicator = srp() + if indicator == '*': + name = 'alias' + else: + name = 'anchor' + self.reader.forward() + length = 0 + ch = srp(length) + # while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \ + # or ch in '-_': + while check_anchorname_char(ch): + length += 1 + ch = srp(length) + if not length: + raise ScannerError( + _F('while scanning an {name!s}', name=name), + start_mark, + _F('expected alphabetic or numeric character, but found {ch!r}', ch=ch), + self.reader.get_mark(), + ) + value = self.reader.prefix(length) + self.reader.forward(length) + # ch1 = ch + # ch = srp() # no need to peek, ch is already set + # assert ch1 == ch + if ch not in '\0 \t\r\n\x85\u2028\u2029?:,[]{}%@`': + raise ScannerError( + _F('while scanning an {name!s}', name=name), + start_mark, + _F('expected alphabetic or numeric character, but found {ch!r}', ch=ch), + self.reader.get_mark(), + ) + end_mark = self.reader.get_mark() + return TokenClass(value, start_mark, end_mark) + + def scan_tag(self): + # type: () -> Any + # See the specification for details. + srp = self.reader.peek + start_mark = self.reader.get_mark() + ch = srp(1) + if ch == '<': + handle = None + self.reader.forward(2) + suffix = self.scan_tag_uri('tag', start_mark) + if srp() != '>': + raise ScannerError( + 'while parsing a tag', + start_mark, + _F("expected '>', but found {srp_call!r}", srp_call=srp()), + self.reader.get_mark(), + ) + self.reader.forward() + elif ch in _THE_END_SPACE_TAB: + handle = None + suffix = '!' + self.reader.forward() + else: + length = 1 + use_handle = False + while ch not in '\0 \r\n\x85\u2028\u2029': + if ch == '!': + use_handle = True + break + length += 1 + ch = srp(length) + handle = '!' + if use_handle: + handle = self.scan_tag_handle('tag', start_mark) + else: + handle = '!' + self.reader.forward() + suffix = self.scan_tag_uri('tag', start_mark) + ch = srp() + if ch not in '\0 \r\n\x85\u2028\u2029': + raise ScannerError( + 'while scanning a tag', + start_mark, + _F("expected ' ', but found {ch!r}", ch=ch), + self.reader.get_mark(), + ) + value = (handle, suffix) + end_mark = self.reader.get_mark() + return TagToken(value, start_mark, end_mark) + + def scan_block_scalar(self, style, rt=False): + # type: (Any, Optional[bool]) -> Any + # See the specification for details. + srp = self.reader.peek + if style == '>': + folded = True + else: + folded = False + + chunks = [] # type: List[Any] + start_mark = self.reader.get_mark() + + # Scan the header. + self.reader.forward() + chomping, increment = self.scan_block_scalar_indicators(start_mark) + # block scalar comment e.g. 
: |+ # comment text + block_scalar_comment = self.scan_block_scalar_ignored_line(start_mark) + + # Determine the indentation level and go to the first non-empty line. + min_indent = self.indent + 1 + if increment is None: + # no increment and top level, min_indent could be 0 + if min_indent < 1 and ( + style not in '|>' + or (self.scanner_processing_version == (1, 1)) + and getattr( + self.loader, + 'top_level_block_style_scalar_no_indent_error_1_1', + False, + ) + ): + min_indent = 1 + breaks, max_indent, end_mark = self.scan_block_scalar_indentation() + indent = max(min_indent, max_indent) + else: + if min_indent < 1: + min_indent = 1 + indent = min_indent + increment - 1 + breaks, end_mark = self.scan_block_scalar_breaks(indent) + line_break = "" + + # Scan the inner part of the block scalar. + while self.reader.column == indent and srp() != '\0': + chunks.extend(breaks) + leading_non_space = srp() not in ' \t' + length = 0 + while srp(length) not in _THE_END: + length += 1 + chunks.append(self.reader.prefix(length)) + self.reader.forward(length) + line_break = self.scan_line_break() + breaks, end_mark = self.scan_block_scalar_breaks(indent) + if style in '|>' and min_indent == 0: + # at the beginning of a line, if in block style see if + # end of document/start_new_document + if self.check_document_start() or self.check_document_end(): + break + if self.reader.column == indent and srp() != '\0': + + # Unfortunately, folding rules are ambiguous. + # + # This is the folding according to the specification: + + if rt and folded and line_break == '\n': + chunks.append('\a') + if ( + folded + and line_break == '\n' + and leading_non_space + and srp() not in ' \t' + ): + if not breaks: + chunks.append(' ') + else: + chunks.append(line_break) + + # This is Clark Evans's interpretation (also in the spec + # examples): + # + # if folded and line_break == '\n': + # if not breaks: + # if srp() not in ' \t': + # chunks.append(' ') + # else: + # chunks.append(line_break) + # else: + # chunks.append(line_break) + else: + break + + # Process trailing line breaks. The 'chomping' setting determines + # whether they are included in the value. + trailing = [] # type: List[Any] + if chomping in [None, True]: + chunks.append(line_break) + if chomping is True: + chunks.extend(breaks) + elif chomping in [None, False]: + trailing.extend(breaks) + + # We are done. + token = ScalarToken("".join(chunks), False, start_mark, end_mark, style) + if self.loader is not None: + comment_handler = getattr(self.loader, 'comment_handling', False) + if comment_handler is None: + if block_scalar_comment is not None: + token.add_pre_comments([block_scalar_comment]) + if len(trailing) > 0: + # Eat whitespaces and comments until we reach the next token. + if self.loader is not None: + comment_handler = getattr(self.loader, 'comment_handling', None) + if comment_handler is not None: + line = end_mark.line - len(trailing) + for x in trailing: + assert x[-1] == '\n' + self.comments.add_blank_line(x, 0, line) # type: ignore + line += 1 + comment = self.scan_to_next_token() + while comment: + trailing.append(' ' * comment[1].column + comment[0]) + comment = self.scan_to_next_token() + if self.loader is not None: + comment_handler = getattr(self.loader, 'comment_handling', False) + if comment_handler is None: + # Keep track of the trailing whitespace and following comments + # as a comment token, if isn't all included in the actual value. 
+ comment_end_mark = self.reader.get_mark() + comment = CommentToken( + "".join(trailing), end_mark, comment_end_mark + ) + token.add_post_comment(comment) + return token + + def scan_block_scalar_indicators(self, start_mark): + # type: (Any) -> Any + # See the specification for details. + srp = self.reader.peek + chomping = None + increment = None + ch = srp() + if ch in '+-': + if ch == '+': + chomping = True + else: + chomping = False + self.reader.forward() + ch = srp() + if ch in '0123456789': + increment = int(ch) + if increment == 0: + raise ScannerError( + 'while scanning a block scalar', + start_mark, + 'expected indentation indicator in the range 1-9, ' + 'but found 0', + self.reader.get_mark(), + ) + self.reader.forward() + elif ch in '0123456789': + increment = int(ch) + if increment == 0: + raise ScannerError( + 'while scanning a block scalar', + start_mark, + 'expected indentation indicator in the range 1-9, ' 'but found 0', + self.reader.get_mark(), + ) + self.reader.forward() + ch = srp() + if ch in '+-': + if ch == '+': + chomping = True + else: + chomping = False + self.reader.forward() + ch = srp() + if ch not in '\0 \r\n\x85\u2028\u2029': + raise ScannerError( + 'while scanning a block scalar', + start_mark, + _F( + 'expected chomping or indentation indicators, but found {ch!r}', + ch=ch, + ), + self.reader.get_mark(), + ) + return chomping, increment + + def scan_block_scalar_ignored_line(self, start_mark): + # type: (Any) -> Any + # See the specification for details. + srp = self.reader.peek + srf = self.reader.forward + prefix = '' + comment = None + while srp() == ' ': + prefix += srp() + srf() + if srp() == '#': + comment = prefix + while srp() not in _THE_END: + comment += srp() + srf() + ch = srp() + if ch not in _THE_END: + raise ScannerError( + 'while scanning a block scalar', + start_mark, + _F('expected a comment or a line break, but found {ch!r}', ch=ch), + self.reader.get_mark(), + ) + self.scan_line_break() + return comment + + def scan_block_scalar_indentation(self): + # type: () -> Any + # See the specification for details. + srp = self.reader.peek + srf = self.reader.forward + chunks = [] + max_indent = 0 + end_mark = self.reader.get_mark() + while srp() in ' \r\n\x85\u2028\u2029': + if srp() != ' ': + chunks.append(self.scan_line_break()) + end_mark = self.reader.get_mark() + else: + srf() + if self.reader.column > max_indent: + max_indent = self.reader.column + return chunks, max_indent, end_mark + + def scan_block_scalar_breaks(self, indent): + # type: (int) -> Any + # See the specification for details. + chunks = [] + srp = self.reader.peek + srf = self.reader.forward + end_mark = self.reader.get_mark() + while self.reader.column < indent and srp() == ' ': + srf() + while srp() in '\r\n\x85\u2028\u2029': + chunks.append(self.scan_line_break()) + end_mark = self.reader.get_mark() + while self.reader.column < indent and srp() == ' ': + srf() + return chunks, end_mark + + def scan_flow_scalar(self, style): + # type: (Any) -> Any + # See the specification for details. + # Note that we loose indentation rules for quoted scalars. Quoted + # scalars don't need to adhere indentation because " and ' clearly + # mark the beginning and the end of them. Therefore we are less + # restrictive then the specification requires. We only need to check + # that document separators are not included in scalars. 
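+        # Illustrative examples (hypothetical input):
+        #
+        #     "a\tb"   -> ScalarToken value 'a<TAB>b' (via ESCAPE_REPLACEMENTS)
+        #     'it''s'  -> ScalarToken value "it's" ('' collapses to ')
+        #
+        # Escape sequences are only interpreted in the double-quoted case.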
+ if style == '"': + double = True + else: + double = False + srp = self.reader.peek + chunks = [] # type: List[Any] + start_mark = self.reader.get_mark() + quote = srp() + self.reader.forward() + chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark)) + while srp() != quote: + chunks.extend(self.scan_flow_scalar_spaces(double, start_mark)) + chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark)) + self.reader.forward() + end_mark = self.reader.get_mark() + return ScalarToken("".join(chunks), False, start_mark, end_mark, style) + + ESCAPE_REPLACEMENTS = { + '0': '\0', + 'a': '\x07', + 'b': '\x08', + 't': '\x09', + '\t': '\x09', + 'n': '\x0A', + 'v': '\x0B', + 'f': '\x0C', + 'r': '\x0D', + 'e': '\x1B', + ' ': '\x20', + '"': '"', + '/': '/', # as per http://www.json.org/ + '\\': '\\', + 'N': '\x85', + '_': '\xA0', + 'L': '\u2028', + 'P': '\u2029', + } + + ESCAPE_CODES = {'x': 2, 'u': 4, 'U': 8} + + def scan_flow_scalar_non_spaces(self, double, start_mark): + # type: (Any, Any) -> Any + # See the specification for details. + chunks = [] # type: List[Any] + srp = self.reader.peek + srf = self.reader.forward + while True: + length = 0 + while srp(length) not in ' \n\'"\\\0\t\r\x85\u2028\u2029': + length += 1 + if length != 0: + chunks.append(self.reader.prefix(length)) + srf(length) + ch = srp() + if not double and ch == "'" and srp(1) == "'": + chunks.append("'") + srf(2) + elif (double and ch == "'") or (not double and ch in '"\\'): + chunks.append(ch) + srf() + elif double and ch == '\\': + srf() + ch = srp() + if ch in self.ESCAPE_REPLACEMENTS: + chunks.append(self.ESCAPE_REPLACEMENTS[ch]) + srf() + elif ch in self.ESCAPE_CODES: + length = self.ESCAPE_CODES[ch] + srf() + for k in range(length): + if srp(k) not in '0123456789ABCDEFabcdef': + raise ScannerError( + 'while scanning a double-quoted scalar', + start_mark, + _F( + 'expected escape sequence of {length:d} hexdecimal ' + 'numbers, but found {srp_call!r}', + length=length, + srp_call=srp(k), + ), + self.reader.get_mark(), + ) + code = int(self.reader.prefix(length), 16) + chunks.append(chr(code)) + srf(length) + elif ch in '\n\r\x85\u2028\u2029': + self.scan_line_break() + chunks.extend(self.scan_flow_scalar_breaks(double, start_mark)) + else: + raise ScannerError( + 'while scanning a double-quoted scalar', + start_mark, + _F('found unknown escape character {ch!r}', ch=ch), + self.reader.get_mark(), + ) + else: + return chunks + + def scan_flow_scalar_spaces(self, double, start_mark): + # type: (Any, Any) -> Any + # See the specification for details. + srp = self.reader.peek + chunks = [] + length = 0 + while srp(length) in ' \t': + length += 1 + whitespaces = self.reader.prefix(length) + self.reader.forward(length) + ch = srp() + if ch == '\0': + raise ScannerError( + 'while scanning a quoted scalar', + start_mark, + 'found unexpected end of stream', + self.reader.get_mark(), + ) + elif ch in '\r\n\x85\u2028\u2029': + line_break = self.scan_line_break() + breaks = self.scan_flow_scalar_breaks(double, start_mark) + if line_break != '\n': + chunks.append(line_break) + elif not breaks: + chunks.append(' ') + chunks.extend(breaks) + else: + chunks.append(whitespaces) + return chunks + + def scan_flow_scalar_breaks(self, double, start_mark): + # type: (Any, Any) -> Any + # See the specification for details. + chunks = [] # type: List[Any] + srp = self.reader.peek + srf = self.reader.forward + while True: + # Instead of checking indentation, we check for document + # separators. 
+ prefix = self.reader.prefix(3) + if (prefix == '---' or prefix == '...') and srp(3) in _THE_END_SPACE_TAB: + raise ScannerError( + 'while scanning a quoted scalar', + start_mark, + 'found unexpected document separator', + self.reader.get_mark(), + ) + while srp() in ' \t': + srf() + if srp() in '\r\n\x85\u2028\u2029': + chunks.append(self.scan_line_break()) + else: + return chunks + + def scan_plain(self): + # type: () -> Any + # See the specification for details. + # We add an additional restriction for the flow context: + # plain scalars in the flow context cannot contain ',', ': ' and '?'. + # We also keep track of the `allow_simple_key` flag here. + # Indentation rules are loosed for the flow context. + srp = self.reader.peek + srf = self.reader.forward + chunks = [] # type: List[Any] + start_mark = self.reader.get_mark() + end_mark = start_mark + indent = self.indent + 1 + # We allow zero indentation for scalars, but then we need to check for + # document separators at the beginning of the line. + # if indent == 0: + # indent = 1 + spaces = [] # type: List[Any] + while True: + length = 0 + if srp() == '#': + break + while True: + ch = srp(length) + if ch == ':' and srp(length + 1) not in _THE_END_SPACE_TAB: + pass + elif ch == '?' and self.scanner_processing_version != (1, 1): + pass + elif ( + ch in _THE_END_SPACE_TAB + or ( + not self.flow_level + and ch == ':' + and srp(length + 1) in _THE_END_SPACE_TAB + ) + or (self.flow_level and ch in ',:?[]{}') + ): + break + length += 1 + # It's not clear what we should do with ':' in the flow context. + if ( + self.flow_level + and ch == ':' + and srp(length + 1) not in '\0 \t\r\n\x85\u2028\u2029,[]{}' + ): + srf(length) + raise ScannerError( + 'while scanning a plain scalar', + start_mark, + "found unexpected ':'", + self.reader.get_mark(), + 'Please check ' + 'http://pyyaml.org/wiki/YAMLColonInFlowContext ' + 'for details.', + ) + if length == 0: + break + self.allow_simple_key = False + chunks.extend(spaces) + chunks.append(self.reader.prefix(length)) + srf(length) + end_mark = self.reader.get_mark() + spaces = self.scan_plain_spaces(indent, start_mark) + if ( + not spaces + or srp() == '#' + or (not self.flow_level and self.reader.column < indent) + ): + break + + token = ScalarToken("".join(chunks), True, start_mark, end_mark) + # getattr provides True so C type loader, which cannot handle comment, + # will not make CommentToken + if self.loader is not None: + comment_handler = getattr(self.loader, 'comment_handling', False) + if comment_handler is None: + if spaces and spaces[0] == '\n': + # Create a comment token to preserve the trailing line breaks. + comment = CommentToken("".join(spaces) + '\n', start_mark, end_mark) + token.add_post_comment(comment) + elif comment_handler is not False: + line = start_mark.line + 1 + for ch in spaces: + if ch == '\n': + self.comments.add_blank_line('\n', 0, line) # type: ignore + line += 1 + + return token + + def scan_plain_spaces(self, indent, start_mark): + # type: (Any, Any) -> Any + # See the specification for details. + # The specification is really confusing about tabs in plain scalars. + # We just forbid them completely. Do not use tabs in YAML! 
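+        # Illustrative example (hypothetical input): in
+        #
+        #     key: word1
+        #          word2
+        #
+        # the line break between word1 and word2 is folded into a single
+        # space, so the caller ends up with the plain scalar 'word1 word2'.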
+ srp = self.reader.peek + srf = self.reader.forward + chunks = [] + length = 0 + while srp(length) in ' ': + length += 1 + whitespaces = self.reader.prefix(length) + self.reader.forward(length) + ch = srp() + if ch in '\r\n\x85\u2028\u2029': + line_break = self.scan_line_break() + self.allow_simple_key = True + prefix = self.reader.prefix(3) + if (prefix == '---' or prefix == '...') and srp(3) in _THE_END_SPACE_TAB: + return + breaks = [] + while srp() in ' \r\n\x85\u2028\u2029': + if srp() == ' ': + srf() + else: + breaks.append(self.scan_line_break()) + prefix = self.reader.prefix(3) + if (prefix == '---' or prefix == '...') and srp( + 3 + ) in _THE_END_SPACE_TAB: + return + if line_break != '\n': + chunks.append(line_break) + elif not breaks: + chunks.append(' ') + chunks.extend(breaks) + elif whitespaces: + chunks.append(whitespaces) + return chunks + + def scan_tag_handle(self, name, start_mark): + # type: (Any, Any) -> Any + # See the specification for details. + # For some strange reasons, the specification does not allow '_' in + # tag handles. I have allowed it anyway. + srp = self.reader.peek + ch = srp() + if ch != '!': + raise ScannerError( + _F('while scanning an {name!s}', name=name), + start_mark, + _F("expected '!', but found {ch!r}", ch=ch), + self.reader.get_mark(), + ) + length = 1 + ch = srp(length) + if ch != ' ': + while ( + '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' or ch in '-_' + ): + length += 1 + ch = srp(length) + if ch != '!': + self.reader.forward(length) + raise ScannerError( + _F('while scanning an {name!s}', name=name), + start_mark, + _F("expected '!', but found {ch!r}", ch=ch), + self.reader.get_mark(), + ) + length += 1 + value = self.reader.prefix(length) + self.reader.forward(length) + return value + + def scan_tag_uri(self, name, start_mark): + # type: (Any, Any) -> Any + # See the specification for details. + # Note: we do not check if URI is well-formed. + srp = self.reader.peek + chunks = [] + length = 0 + ch = srp(length) + while ( + '0' <= ch <= '9' + or 'A' <= ch <= 'Z' + or 'a' <= ch <= 'z' + or ch in "-;/?:@&=+$,_.!~*'()[]%" + or ((self.scanner_processing_version > (1, 1)) and ch == '#') + ): + if ch == '%': + chunks.append(self.reader.prefix(length)) + self.reader.forward(length) + length = 0 + chunks.append(self.scan_uri_escapes(name, start_mark)) + else: + length += 1 + ch = srp(length) + if length != 0: + chunks.append(self.reader.prefix(length)) + self.reader.forward(length) + length = 0 + if not chunks: + raise ScannerError( + _F('while parsing an {name!s}', name=name), + start_mark, + _F('expected URI, but found {ch!r}', ch=ch), + self.reader.get_mark(), + ) + return "".join(chunks) + + def scan_uri_escapes(self, name, start_mark): + # type: (Any, Any) -> Any + # See the specification for details. 
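+        # Illustrative example (hypothetical input): in a tag such as
+        #
+        #     !<tag:example.com,2000:a%21b>
+        #
+        # scan_tag_uri() hands the '%21' run to this helper, which decodes
+        # it as UTF-8 to '!', giving the suffix 'tag:example.com,2000:a!b'.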
+ srp = self.reader.peek + srf = self.reader.forward + code_bytes = [] # type: List[Any] + mark = self.reader.get_mark() + while srp() == '%': + srf() + for k in range(2): + if srp(k) not in '0123456789ABCDEFabcdef': + raise ScannerError( + _F('while scanning an {name!s}', name=name), + start_mark, + _F( + 'expected URI escape sequence of 2 hexdecimal numbers,' + ' but found {srp_call!r}', + srp_call=srp(k), + ), + self.reader.get_mark(), + ) + code_bytes.append(int(self.reader.prefix(2), 16)) + srf(2) + try: + value = bytes(code_bytes).decode('utf-8') + except UnicodeDecodeError as exc: + raise ScannerError( + _F('while scanning an {name!s}', name=name), start_mark, str(exc), mark + ) + return value + + def scan_line_break(self): + # type: () -> Any + # Transforms: + # '\r\n' : '\n' + # '\r' : '\n' + # '\n' : '\n' + # '\x85' : '\n' + # '\u2028' : '\u2028' + # '\u2029 : '\u2029' + # default : '' + ch = self.reader.peek() + if ch in '\r\n\x85': + if self.reader.prefix(2) == '\r\n': + self.reader.forward(2) + else: + self.reader.forward() + return '\n' + elif ch in '\u2028\u2029': + self.reader.forward() + return ch + return "" + + +class RoundTripScanner(Scanner): + def check_token(self, *choices): + # type: (Any) -> bool + # Check if the next token is one of the given types. + while self.need_more_tokens(): + self.fetch_more_tokens() + self._gather_comments() + if len(self.tokens) > 0: + if not choices: + return True + for choice in choices: + if isinstance(self.tokens[0], choice): + return True + return False + + def peek_token(self): + # type: () -> Any + # Return the next token, but do not delete if from the queue. + while self.need_more_tokens(): + self.fetch_more_tokens() + self._gather_comments() + if len(self.tokens) > 0: + return self.tokens[0] + return None + + def _gather_comments(self): + # type: () -> Any + """combine multiple comment lines and assign to next non-comment-token""" + comments = [] # type: List[Any] + if not self.tokens: + return comments + if isinstance(self.tokens[0], CommentToken): + comment = self.tokens.pop(0) + self.tokens_taken += 1 + comments.append(comment) + while self.need_more_tokens(): + self.fetch_more_tokens() + if not self.tokens: + return comments + if isinstance(self.tokens[0], CommentToken): + self.tokens_taken += 1 + comment = self.tokens.pop(0) + # nprint('dropping2', comment) + comments.append(comment) + if len(comments) >= 1: + self.tokens[0].add_pre_comments(comments) + # pull in post comment on e.g. ':' + if not self.done and len(self.tokens) < 2: + self.fetch_more_tokens() + + def get_token(self): + # type: () -> Any + # Return the next token. + while self.need_more_tokens(): + self.fetch_more_tokens() + self._gather_comments() + if len(self.tokens) > 0: + # nprint('tk', self.tokens) + # only add post comment to single line tokens: + # scalar, value token. 
FlowXEndToken, otherwise + # hidden streamtokens could get them (leave them and they will be + # pre comments for the next map/seq + if ( + len(self.tokens) > 1 + and isinstance( + self.tokens[0], + ( + ScalarToken, + ValueToken, + FlowSequenceEndToken, + FlowMappingEndToken, + ), + ) + and isinstance(self.tokens[1], CommentToken) + and self.tokens[0].end_mark.line == self.tokens[1].start_mark.line + ): + self.tokens_taken += 1 + c = self.tokens.pop(1) + self.fetch_more_tokens() + while len(self.tokens) > 1 and isinstance(self.tokens[1], CommentToken): + self.tokens_taken += 1 + c1 = self.tokens.pop(1) + c.value = c.value + (' ' * c1.start_mark.column) + c1.value + self.fetch_more_tokens() + self.tokens[0].add_post_comment(c) + elif ( + len(self.tokens) > 1 + and isinstance(self.tokens[0], ScalarToken) + and isinstance(self.tokens[1], CommentToken) + and self.tokens[0].end_mark.line != self.tokens[1].start_mark.line + ): + self.tokens_taken += 1 + c = self.tokens.pop(1) + c.value = ( + '\n' * (c.start_mark.line - self.tokens[0].end_mark.line) + + (' ' * c.start_mark.column) + + c.value + ) + self.tokens[0].add_post_comment(c) + self.fetch_more_tokens() + while len(self.tokens) > 1 and isinstance(self.tokens[1], CommentToken): + self.tokens_taken += 1 + c1 = self.tokens.pop(1) + c.value = c.value + (' ' * c1.start_mark.column) + c1.value + self.fetch_more_tokens() + self.tokens_taken += 1 + return self.tokens.pop(0) + return None + + def fetch_comment(self, comment): + # type: (Any) -> None + value, start_mark, end_mark = comment + while value and value[-1] == ' ': + # empty line within indented key context + # no need to update end-mark, that is not used + value = value[:-1] + self.tokens.append(CommentToken(value, start_mark, end_mark)) + + # scanner + + def scan_to_next_token(self): + # type: () -> Any + # We ignore spaces, line breaks and comments. + # If we find a line break in the block context, we set the flag + # `allow_simple_key` on. + # The byte order mark is stripped if it's the first character in the + # stream. We do not yet support BOM inside the stream as the + # specification requires. Any such mark will be considered as a part + # of the document. + # + # TODO: We need to make tab handling rules more sane. A good rule is + # Tabs cannot precede tokens + # BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END, + # KEY(block), VALUE(block), BLOCK-ENTRY + # So the checking code is + # if <TAB>: + # self.allow_simple_keys = False + # We also need to add the check for `allow_simple_keys == True` to + # `unwind_indent` before issuing BLOCK-END. + # Scanners for block, flow, and plain scalars need to be modified. 
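+        # Illustrative example (hypothetical input): for
+        #
+        #     a: 1  # trailing note
+        #
+        # this round-trip variant returns ('# trailing note\n', start_mark,
+        # end_mark) so fetch_comment() can emit a CommentToken, instead of
+        # discarding the comment as the base Scanner does.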
+ + srp = self.reader.peek + srf = self.reader.forward + if self.reader.index == 0 and srp() == '\uFEFF': + srf() + found = False + while not found: + while srp() == ' ': + srf() + ch = srp() + if ch == '#': + start_mark = self.reader.get_mark() + comment = ch + srf() + while ch not in _THE_END: + ch = srp() + if ch == '\0': # don't gobble the end-of-stream character + # but add an explicit newline as "YAML processors should terminate + # the stream with an explicit line break + # https://yaml.org/spec/1.2/spec.html#id2780069 + comment += '\n' + break + comment += ch + srf() + # gather any blank lines following the comment too + ch = self.scan_line_break() + while len(ch) > 0: + comment += ch + ch = self.scan_line_break() + end_mark = self.reader.get_mark() + if not self.flow_level: + self.allow_simple_key = True + return comment, start_mark, end_mark + if self.scan_line_break() != '': + start_mark = self.reader.get_mark() + if not self.flow_level: + self.allow_simple_key = True + ch = srp() + if ch == '\n': # empty toplevel lines + start_mark = self.reader.get_mark() + comment = "" + while ch: + ch = self.scan_line_break(empty_line=True) + comment += ch + if srp() == '#': + # empty line followed by indented real comment + comment = comment.rsplit('\n', 1)[0] + '\n' + end_mark = self.reader.get_mark() + return comment, start_mark, end_mark + else: + found = True + return None + + def scan_line_break(self, empty_line=False): + # type: (bool) -> Text + # Transforms: + # '\r\n' : '\n' + # '\r' : '\n' + # '\n' : '\n' + # '\x85' : '\n' + # '\u2028' : '\u2028' + # '\u2029 : '\u2029' + # default : '' + ch = self.reader.peek() # type: Text + if ch in '\r\n\x85': + if self.reader.prefix(2) == '\r\n': + self.reader.forward(2) + else: + self.reader.forward() + return '\n' + elif ch in '\u2028\u2029': + self.reader.forward() + return ch + elif empty_line and ch in '\t ': + self.reader.forward() + return ch + return "" + + def scan_block_scalar(self, style, rt=True): + # type: (Any, Optional[bool]) -> Any + return Scanner.scan_block_scalar(self, style, rt=rt) + + +# commenthandling 2021, differentiatiation not needed + +VALUECMNT = 0 +KEYCMNT = 0 # 1 +# TAGCMNT = 2 +# ANCHORCMNT = 3 + + +class CommentBase: + __slots__ = ( + 'value', + 'line', + 'column', + 'used', + 'function', + 'fline', + 'ufun', + 'uline', + ) + + def __init__(self, value, line, column): + # type: (Any, Any, Any) -> None + self.value = value + self.line = line + self.column = column + self.used = ' ' + info = inspect.getframeinfo(inspect.stack()[3][0]) + self.function = info.function + self.fline = info.lineno + self.ufun = None + self.uline = None + + def set_used(self, v='+'): + # type: (Any) -> None + self.used = v + info = inspect.getframeinfo(inspect.stack()[1][0]) + self.ufun = info.function # type: ignore + self.uline = info.lineno # type: ignore + + def set_assigned(self): + # type: () -> None + self.used = '|' + + def __str__(self): + # type: () -> str + return _F('{value}', value=self.value) # type: ignore + + def __repr__(self): + # type: () -> str + return _F('{value!r}', value=self.value) # type: ignore + + def info(self): + # type: () -> str + return _F( # type: ignore + '{name}{used} {line:2}:{column:<2} "{value:40s} {function}:{fline} {ufun}:{uline}', + name=self.name, # type: ignore + line=self.line, + column=self.column, + value=self.value + '"', + used=self.used, + function=self.function, + fline=self.fline, + ufun=self.ufun, + uline=self.uline, + ) + + +class EOLComment(CommentBase): + name = 'EOLC' + + def 
+        # type: (Any, Any, Any) -> None
+        super().__init__(value, line, column)
+
+
+class FullLineComment(CommentBase):
+    name = 'FULL'
+
+    def __init__(self, value, line, column):
+        # type: (Any, Any, Any) -> None
+        super().__init__(value, line, column)
+
+
+class BlankLineComment(CommentBase):
+    name = 'BLNK'
+
+    def __init__(self, value, line, column):
+        # type: (Any, Any, Any) -> None
+        super().__init__(value, line, column)
+
+
+class ScannedComments:
+    def __init__(self):
+        # type: (Any) -> None
+        self.comments = {}  # type: ignore
+        self.unused = []  # type: ignore
+
+    def add_eol_comment(self, comment, column, line):
+        # type: (Any, Any, Any) -> Any
+        # info = inspect.getframeinfo(inspect.stack()[1][0])
+        if comment.count('\n') == 1:
+            assert comment[-1] == '\n'
+        else:
+            assert '\n' not in comment
+        self.comments[line] = retval = EOLComment(comment[:-1], line, column)
+        self.unused.append(line)
+        return retval
+
+    def add_blank_line(self, comment, column, line):
+        # type: (Any, Any, Any) -> Any
+        # info = inspect.getframeinfo(inspect.stack()[1][0])
+        assert comment.count('\n') == 1 and comment[-1] == '\n'
+        assert line not in self.comments
+        self.comments[line] = retval = BlankLineComment(comment[:-1], line, column)
+        self.unused.append(line)
+        return retval
+
+    def add_full_line_comment(self, comment, column, line):
+        # type: (Any, Any, Any) -> Any
+        # info = inspect.getframeinfo(inspect.stack()[1][0])
+        assert comment.count('\n') == 1 and comment[-1] == '\n'
+        # if comment.startswith('# C12'):
+        #     raise
+        # this raises in line 2127 for 330
+        self.comments[line] = retval = FullLineComment(comment[:-1], line, column)
+        self.unused.append(line)
+        return retval
+
+    def __getitem__(self, idx):
+        # type: (Any) -> Any
+        return self.comments[idx]
+
+    def __str__(self):
+        # type: () -> Any
+        return (
+            'ParsedComments:\n  '
+            + '\n  '.join(
+                (
+                    _F('{lineno:2} {x}', lineno=lineno, x=x.info())
+                    for lineno, x in self.comments.items()
+                )
+            )
+            + '\n'
+        )
+
+    def last(self):
+        # type: () -> str
+        lineno, x = list(self.comments.items())[-1]
+        return _F('{lineno:2} {x}\n', lineno=lineno, x=x.info())  # type: ignore
+
+    def any_unprocessed(self):
+        # type: () -> bool
+        # ToDo: might want to differentiate based on lineno
+        return len(self.unused) > 0
+        # for lno, comment in reversed(self.comments.items()):
+        #     if comment.used == ' ':
+        #         return True
+        # return False
+
+    def unprocessed(self, use=False):
+        # type: (Any) -> Any
+        while len(self.unused) > 0:
+            first = self.unused.pop(0) if use else self.unused[0]
+            info = inspect.getframeinfo(inspect.stack()[1][0])
+            xprintf(
+                'using', first, self.comments[first].value, info.function, info.lineno
+            )
+            yield first, self.comments[first]
+            if use:
+                self.comments[first].set_used()
+
+    def assign_pre(self, token):
+        # type: (Any) -> Any
+        token_line = token.start_mark.line
+        info = inspect.getframeinfo(inspect.stack()[1][0])
+        xprintf('assign_pre', token_line, self.unused, info.function, info.lineno)
+        gobbled = False
+        while self.unused and self.unused[0] < token_line:
+            gobbled = True
+            first = self.unused.pop(0)
+            xprintf('assign_pre < ', first)
+            self.comments[first].set_used()
+            token.add_comment_pre(first)
+        return gobbled
+
+    def assign_eol(self, tokens):
+        # type: (Any) -> Any
+        try:
+            comment_line = self.unused[0]
+        except IndexError:
+            return
+        if not isinstance(self.comments[comment_line], EOLComment):
+            return
+        idx = 1
+        while tokens[-idx].start_mark.line > comment_line or isinstance(
+            tokens[-idx], ValueToken
+        ):
+            idx += 1
+        xprintf('idx1', idx)
+        if (
+            len(tokens) > idx
+            and isinstance(tokens[-idx], ScalarToken)
+            and isinstance(tokens[-(idx + 1)], ScalarToken)
+        ):
+            return
+        try:
+            if isinstance(tokens[-idx], ScalarToken) and isinstance(
+                tokens[-(idx + 1)], KeyToken
+            ):
+                try:
+                    eol_idx = self.unused.pop(0)
+                    self.comments[eol_idx].set_used()
+                    xprintf('>>>>>a', idx, eol_idx, KEYCMNT)
+                    tokens[-idx].add_comment_eol(eol_idx, KEYCMNT)
+                except IndexError:
+                    raise NotImplementedError
+                return
+        except IndexError:
+            xprintf('IndexError1')
+        try:
+            if isinstance(tokens[-idx], ScalarToken) and isinstance(
+                tokens[-(idx + 1)], (ValueToken, BlockEntryToken)
+            ):
+                try:
+                    eol_idx = self.unused.pop(0)
+                    self.comments[eol_idx].set_used()
+                    tokens[-idx].add_comment_eol(eol_idx, VALUECMNT)
+                except IndexError:
+                    raise NotImplementedError
+                return
+        except IndexError:
+            xprintf('IndexError2')
+        for t in tokens:
+            xprintf('tt-', t)
+        xprintf('not implemented EOL', type(tokens[-idx]))
+        import sys
+
+        sys.exit(0)
+
+    def assign_post(self, token):
+        # type: (Any) -> Any
+        token_line = token.start_mark.line
+        info = inspect.getframeinfo(inspect.stack()[1][0])
+        xprintf('assign_post', token_line, self.unused, info.function, info.lineno)
+        gobbled = False
+        while self.unused and self.unused[0] < token_line:
+            gobbled = True
+            first = self.unused.pop(0)
+            xprintf('assign_post < ', first)
+            self.comments[first].set_used()
+            token.add_comment_post(first)
+        return gobbled
+
+    def str_unprocessed(self):
+        # type: () -> Any
+        return ''.join(
+            (
+                _F('  {ind:2} {x}\n', ind=ind, x=x.info())
+                for ind, x in self.comments.items()
+                if x.used == ' '
+            )
+        )
+
+
+class RoundTripScannerSC(Scanner):  # RoundTripScanner Split Comments
+    def __init__(self, *arg, **kw):
+        # type: (Any, Any) -> None
+        super().__init__(*arg, **kw)
+        assert self.loader is not None
+        # comments is initialised on the first call to .need_more_tokens
+        # and persists as self.loader.parsed_comments
+        self.comments = None
+
+    def get_token(self):
+        # type: () -> Any
+        # Return the next token.
+        while self.need_more_tokens():
+            self.fetch_more_tokens()
+        if len(self.tokens) > 0:
+            if isinstance(self.tokens[0], BlockEndToken):
+                self.comments.assign_post(self.tokens[0])  # type: ignore
+            else:
+                self.comments.assign_pre(self.tokens[0])  # type: ignore
+            self.tokens_taken += 1
+            return self.tokens.pop(0)
+
+    def need_more_tokens(self):
+        # type: () -> bool
+        if self.comments is None:
+            self.loader.parsed_comments = self.comments = ScannedComments()  # type: ignore
+        if self.done:
+            return False
+        if len(self.tokens) == 0:
+            return True
+        # The current token may be a potential simple key, so we
+        # need to look further.
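+        # A rough illustration of why tokens are held back here (the input
+        # below is an assumed example, not taken from the source): for
+        #     a: 1  # eol
+        #     b: 2
+        # the tokens for 'a: 1' stay buffered until a token from the next
+        # line arrives, so that assign_eol() further down can still attach
+        # '# eol' to the scalar token for '1' before it is handed out.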
+        self.stale_possible_simple_keys()
+        if self.next_possible_simple_key() == self.tokens_taken:
+            return True
+        if len(self.tokens) < 2:
+            return True
+        if self.tokens[0].start_mark.line == self.tokens[-1].start_mark.line:
+            return True
+        if True:
+            xprintf('-x--', len(self.tokens))
+            for t in self.tokens:
+                xprintf(t)
+            # xprintf(self.comments.last())
+            xprintf(self.comments.str_unprocessed())  # type: ignore
+        self.comments.assign_pre(self.tokens[0])  # type: ignore
+        self.comments.assign_eol(self.tokens)  # type: ignore
+        return False
+
+    def scan_to_next_token(self):
+        # type: () -> None
+        srp = self.reader.peek
+        srf = self.reader.forward
+        if self.reader.index == 0 and srp() == '\uFEFF':
+            srf()
+        start_mark = self.reader.get_mark()
+        # xprintf('current_mark', start_mark.line, start_mark.column)
+        found = False
+        while not found:
+            while srp() == ' ':
+                srf()
+            ch = srp()
+            if ch == '#':
+                comment_start_mark = self.reader.get_mark()
+                comment = ch
+                srf()  # skip the '#'
+                while ch not in _THE_END:
+                    ch = srp()
+                    if ch == '\0':  # don't gobble the end-of-stream character
+                        # but add an explicit newline, as "YAML processors should terminate
+                        # the stream with an explicit line break"
+                        # https://yaml.org/spec/1.2/spec.html#id2780069
+                        comment += '\n'
+                        break
+                    comment += ch
+                    srf()
+                # we have a comment
+                if start_mark.column == 0:
+                    self.comments.add_full_line_comment(  # type: ignore
+                        comment, comment_start_mark.column, comment_start_mark.line
+                    )
+                else:
+                    self.comments.add_eol_comment(  # type: ignore
+                        comment, comment_start_mark.column, comment_start_mark.line
+                    )
+                    comment = ""
+                # gather any blank lines or full-line comments following this one as well
+                self.scan_empty_or_full_line_comments()
+                if not self.flow_level:
+                    self.allow_simple_key = True
+                return
+            if bool(self.scan_line_break()):
+                # start_mark = self.reader.get_mark()
+                if not self.flow_level:
+                    self.allow_simple_key = True
+                self.scan_empty_or_full_line_comments()
+                return None
+                ch = srp()
+                if ch == '\n':  # empty toplevel lines
+                    start_mark = self.reader.get_mark()
+                    comment = ""
+                    while ch:
+                        ch = self.scan_line_break(empty_line=True)
+                        comment += ch
+                    if srp() == '#':
+                        # empty line followed by indented real comment
+                        comment = comment.rsplit('\n', 1)[0] + '\n'
+                    _ = self.reader.get_mark()  # gobble end_mark
+                    return None
+            else:
+                found = True
+        return None
+
+    def scan_empty_or_full_line_comments(self):
+        # type: () -> None
+        blmark = self.reader.get_mark()
+        assert blmark.column == 0
+        blanks = ""
+        comment = None
+        mark = None
+        ch = self.reader.peek()
+        while True:
+            # nprint('ch', repr(ch), self.reader.get_mark().column)
+            if ch in '\r\n\x85\u2028\u2029':
+                if self.reader.prefix(2) == '\r\n':
+                    self.reader.forward(2)
+                else:
+                    self.reader.forward()
+                if comment is not None:
+                    comment += '\n'
+                    self.comments.add_full_line_comment(comment, mark.column, mark.line)
+                    comment = None
+                else:
+                    blanks += '\n'
+                    self.comments.add_blank_line(blanks, blmark.column, blmark.line)  # type: ignore # NOQA
+                    blanks = ""
+                blmark = self.reader.get_mark()
+                ch = self.reader.peek()
+                continue
+            if comment is None:
+                if ch in ' \t':
+                    blanks += ch
+                elif ch == '#':
+                    mark = self.reader.get_mark()
+                    comment = '#'
+                else:
+                    # print('breaking on', repr(ch))
+                    break
+            else:
+                comment += ch
+            self.reader.forward()
+            ch = self.reader.peek()
+
+    def scan_block_scalar_ignored_line(self, start_mark):
+        # type: (Any) -> Any
+        # See the specification for details.
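+        # For example (an assumed input, for illustration only): in
+        #     key: |  # comment on the header line
+        #       text
+        # this method consumes the spaces after the '|' header, records the
+        # '# ...' part as an EOL comment, and raises a ScannerError if
+        # anything other than a comment or a line break follows.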
+        srp = self.reader.peek
+        srf = self.reader.forward
+        prefix = ''
+        comment = None
+        while srp() == ' ':
+            prefix += srp()
+            srf()
+        if srp() == '#':
+            comment = ''
+            mark = self.reader.get_mark()
+            while srp() not in _THE_END:
+                comment += srp()
+                srf()
+            comment += '\n'  # type: ignore
+        ch = srp()
+        if ch not in _THE_END:
+            raise ScannerError(
+                'while scanning a block scalar',
+                start_mark,
+                _F('expected a comment or a line break, but found {ch!r}', ch=ch),
+                self.reader.get_mark(),
+            )
+        if comment is not None:
+            self.comments.add_eol_comment(comment, mark.column, mark.line)  # type: ignore
+        self.scan_line_break()
+        return None
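+
+
+# A minimal round-trip usage sketch (an assumption for illustration: it
+# presumes ruyaml keeps the ruamel.yaml API it was forked from, i.e. a
+# `YAML` class with `load`/`dump`; none of that is defined in this module):
+#
+#     import sys
+#     from ruyaml import YAML
+#
+#     yaml = YAML()  # round-trip mode is the default
+#     data = yaml.load('a: 1  # keep me\n')
+#     yaml.dump(data, sys.stdout)  # the EOL comment survives the round trip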