diff options
Diffstat (limited to 'lib/ansible/parsing/splitter.py')
-rw-r--r-- | lib/ansible/parsing/splitter.py | 286 |
1 files changed, 286 insertions, 0 deletions
diff --git a/lib/ansible/parsing/splitter.py b/lib/ansible/parsing/splitter.py new file mode 100644 index 0000000..b68444f --- /dev/null +++ b/lib/ansible/parsing/splitter.py @@ -0,0 +1,286 @@ +# (c) 2014 James Cammarata, <jcammarata@ansible.com> +# +# This file is part of Ansible +# +# Ansible is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Ansible is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Ansible. If not, see <http://www.gnu.org/licenses/>. + +# Make coding more python3-ish +from __future__ import (absolute_import, division, print_function) +__metaclass__ = type + +import codecs +import re + +from ansible.errors import AnsibleParserError +from ansible.module_utils._text import to_text +from ansible.parsing.quoting import unquote + + +# Decode escapes adapted from rspeer's answer here: +# http://stackoverflow.com/questions/4020539/process-escape-sequences-in-a-string-in-python +_HEXCHAR = '[a-fA-F0-9]' +_ESCAPE_SEQUENCE_RE = re.compile(r''' + ( \\U{0} # 8-digit hex escapes + | \\u{1} # 4-digit hex escapes + | \\x{2} # 2-digit hex escapes + | \\N\{{[^}}]+\}} # Unicode characters by name + | \\[\\'"abfnrtv] # Single-character escapes + )'''.format(_HEXCHAR * 8, _HEXCHAR * 4, _HEXCHAR * 2), re.UNICODE | re.VERBOSE) + + +def _decode_escapes(s): + def decode_match(match): + return codecs.decode(match.group(0), 'unicode-escape') + + return _ESCAPE_SEQUENCE_RE.sub(decode_match, s) + + +def parse_kv(args, check_raw=False): + ''' + Convert a string of key/value items to a dict. If any free-form params + are found and the check_raw option is set to True, they will be added + to a new parameter called '_raw_params'. If check_raw is not enabled, + they will simply be ignored. + ''' + + args = to_text(args, nonstring='passthru') + + options = {} + if args is not None: + try: + vargs = split_args(args) + except IndexError as e: + raise AnsibleParserError("Unable to parse argument string", orig_exc=e) + except ValueError as ve: + if 'no closing quotation' in str(ve).lower(): + raise AnsibleParserError("error parsing argument string, try quoting the entire line.", orig_exc=ve) + else: + raise + + raw_params = [] + for orig_x in vargs: + x = _decode_escapes(orig_x) + if "=" in x: + pos = 0 + try: + while True: + pos = x.index('=', pos + 1) + if pos > 0 and x[pos - 1] != '\\': + break + except ValueError: + # ran out of string, but we must have some escaped equals, + # so replace those and append this to the list of raw params + raw_params.append(x.replace('\\=', '=')) + continue + + k = x[:pos] + v = x[pos + 1:] + + # FIXME: make the retrieval of this list of shell/command options a function, so the list is centralized + if check_raw and k not in ('creates', 'removes', 'chdir', 'executable', 'warn', 'stdin', 'stdin_add_newline', 'strip_empty_ends'): + raw_params.append(orig_x) + else: + options[k.strip()] = unquote(v.strip()) + else: + raw_params.append(orig_x) + + # recombine the free-form params, if any were found, and assign + # them to a special option for use later by the shell/command module + if len(raw_params) > 0: + options[u'_raw_params'] = join_args(raw_params) + + return options + + +def _get_quote_state(token, quote_char): + ''' + the goal of this block is to determine if the quoted string + is unterminated in which case it needs to be put back together + ''' + # the char before the current one, used to see if + # the current character is escaped + prev_char = None + for idx, cur_char in enumerate(token): + if idx > 0: + prev_char = token[idx - 1] + if cur_char in '"\'' and prev_char != '\\': + if quote_char: + if cur_char == quote_char: + quote_char = None + else: + quote_char = cur_char + return quote_char + + +def _count_jinja2_blocks(token, cur_depth, open_token, close_token): + ''' + this function counts the number of opening/closing blocks for a + given opening/closing type and adjusts the current depth for that + block based on the difference + ''' + num_open = token.count(open_token) + num_close = token.count(close_token) + if num_open != num_close: + cur_depth += (num_open - num_close) + if cur_depth < 0: + cur_depth = 0 + return cur_depth + + +def join_args(s): + ''' + Join the original cmd based on manipulations by split_args(). + This retains the original newlines and whitespaces. + ''' + result = '' + for p in s: + if len(result) == 0 or result.endswith('\n'): + result += p + else: + result += ' ' + p + return result + + +def split_args(args): + ''' + Splits args on whitespace, but intelligently reassembles + those that may have been split over a jinja2 block or quotes. + + When used in a remote module, we won't ever have to be concerned about + jinja2 blocks, however this function is/will be used in the + core portions as well before the args are templated. + + example input: a=b c="foo bar" + example output: ['a=b', 'c="foo bar"'] + + Basically this is a variation shlex that has some more intelligence for + how Ansible needs to use it. + ''' + + # the list of params parsed out of the arg string + # this is going to be the result value when we are done + params = [] + + # Initial split on newlines + items = args.split('\n') + + # iterate over the tokens, and reassemble any that may have been + # split on a space inside a jinja2 block. + # ex if tokens are "{{", "foo", "}}" these go together + + # These variables are used + # to keep track of the state of the parsing, since blocks and quotes + # may be nested within each other. + + quote_char = None + inside_quotes = False + print_depth = 0 # used to count nested jinja2 {{ }} blocks + block_depth = 0 # used to count nested jinja2 {% %} blocks + comment_depth = 0 # used to count nested jinja2 {# #} blocks + + # now we loop over each split chunk, coalescing tokens if the white space + # split occurred within quotes or a jinja2 block of some kind + for (itemidx, item) in enumerate(items): + + # we split on spaces and newlines separately, so that we + # can tell which character we split on for reassembly + # inside quotation characters + tokens = item.split(' ') + + line_continuation = False + for (idx, token) in enumerate(tokens): + + # Empty entries means we have subsequent spaces + # We want to hold onto them so we can reconstruct them later + if len(token) == 0 and idx != 0: + params[-1] += ' ' + continue + + # if we hit a line continuation character, but + # we're not inside quotes, ignore it and continue + # on to the next token while setting a flag + if token == '\\' and not inside_quotes: + line_continuation = True + continue + + # store the previous quoting state for checking later + was_inside_quotes = inside_quotes + quote_char = _get_quote_state(token, quote_char) + inside_quotes = quote_char is not None + + # multiple conditions may append a token to the list of params, + # so we keep track with this flag to make sure it only happens once + # append means add to the end of the list, don't append means concatenate + # it to the end of the last token + appended = False + + # if we're inside quotes now, but weren't before, append the token + # to the end of the list, since we'll tack on more to it later + # otherwise, if we're inside any jinja2 block, inside quotes, or we were + # inside quotes (but aren't now) concat this token to the last param + if inside_quotes and not was_inside_quotes and not (print_depth or block_depth or comment_depth): + params.append(token) + appended = True + elif print_depth or block_depth or comment_depth or inside_quotes or was_inside_quotes: + if idx == 0 and was_inside_quotes: + params[-1] = "%s%s" % (params[-1], token) + elif len(tokens) > 1: + spacer = '' + if idx > 0: + spacer = ' ' + params[-1] = "%s%s%s" % (params[-1], spacer, token) + else: + params[-1] = "%s\n%s" % (params[-1], token) + appended = True + + # if the number of paired block tags is not the same, the depth has changed, so we calculate that here + # and may append the current token to the params (if we haven't previously done so) + prev_print_depth = print_depth + print_depth = _count_jinja2_blocks(token, print_depth, "{{", "}}") + if print_depth != prev_print_depth and not appended: + params.append(token) + appended = True + + prev_block_depth = block_depth + block_depth = _count_jinja2_blocks(token, block_depth, "{%", "%}") + if block_depth != prev_block_depth and not appended: + params.append(token) + appended = True + + prev_comment_depth = comment_depth + comment_depth = _count_jinja2_blocks(token, comment_depth, "{#", "#}") + if comment_depth != prev_comment_depth and not appended: + params.append(token) + appended = True + + # finally, if we're at zero depth for all blocks and not inside quotes, and have not + # yet appended anything to the list of params, we do so now + if not (print_depth or block_depth or comment_depth) and not inside_quotes and not appended and token != '': + params.append(token) + + # if this was the last token in the list, and we have more than + # one item (meaning we split on newlines), add a newline back here + # to preserve the original structure + if len(items) > 1 and itemidx != len(items) - 1 and not line_continuation: + params[-1] += '\n' + + # always clear the line continuation flag + line_continuation = False + + # If we're done and things are not at zero depth or we're still inside quotes, + # raise an error to indicate that the args were unbalanced + if print_depth or block_depth or comment_depth or inside_quotes: + raise AnsibleParserError(u"failed at splitting arguments, either an unbalanced jinja2 block or quotes: {0}".format(args)) + + return params |