Diffstat
-rw-r--r-- | tests/test_data.py | 285 |
1 file changed, 285 insertions, 0 deletions
diff --git a/tests/test_data.py b/tests/test_data.py
new file mode 100644
index 0000000..719f8bf
--- /dev/null
+++ b/tests/test_data.py
@@ -0,0 +1,285 @@
+"""
+    Data Tests
+    ~~~~~~~~~~
+
+    :copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+import time
+
+import pytest
+
+from pygments.lexers.data import JsonLexer, JsonBareObjectLexer, JsonLdLexer
+from pygments.token import Comment, Error, Token, Punctuation, Number, String, \
+    Keyword, Name, Whitespace
+
+
+@pytest.fixture(scope='module')
+def lexer_json():
+    yield JsonLexer()
+
+
+@pytest.fixture(scope='module')
+def lexer_bare():
+    yield JsonBareObjectLexer()
+
+
+@pytest.fixture(scope='module')
+def lexer_json_ld():
+    yield JsonLdLexer()
+
+
+@pytest.mark.parametrize(
+    'text, expected_token_types',
+    (
+        # Integers
+        ('0', (Number.Integer,)),
+        ('-1', (Number.Integer,)),
+        ('1234567890', (Number.Integer,)),
+        ('-1234567890', (Number.Integer,)),
+
+        # Floats, including scientific notation
+        ('123456789.0123456789', (Number.Float,)),
+        ('-123456789.0123456789', (Number.Float,)),
+        ('1e10', (Number.Float,)),
+        ('-1E10', (Number.Float,)),
+        ('1e-10', (Number.Float,)),
+        ('-1E+10', (Number.Float,)),
+        ('1.0e10', (Number.Float,)),
+        ('-1.0E10', (Number.Float,)),
+        ('1.0e-10', (Number.Float,)),
+        ('-1.0E+10', (Number.Float,)),
+
+        # Strings (escapes are tested elsewhere)
+        ('""', (String.Double,)),
+        ('"abc"', (String.Double,)),
+        ('"ひらがな"', (String.Double,)),
+        ('"123"', (String.Double,)),
+        ('"[]"', (String.Double,)),
+        ('"{}"', (String.Double,)),
+        ('"true"', (String.Double,)),
+        ('"false"', (String.Double,)),
+        ('"null"', (String.Double,)),
+        ('":,"', (String.Double,)),
+
+        # Constants
+        ('true', (Keyword.Constant, )),
+        ('false', (Keyword.Constant, )),
+        ('null', (Keyword.Constant, )),
+
+        # Arrays
+        ('[]', (Punctuation,)),
+        ('["a", "b"]', (Punctuation, String.Double, Punctuation,
+                        Whitespace, String.Double, Punctuation)),
+
+        # Objects
+        ('{}', (Punctuation,)),
+        ('{"a": "b"}', (Punctuation, Name.Tag, Punctuation,
+                        Whitespace, String.Double, Punctuation)),
+    )
+)
+@pytest.mark.parametrize('end', ('', '\n'))
+def test_json_literals_positive_match(lexer_json, text, expected_token_types, end):
+    """Validate that syntactically-correct JSON literals are parsed correctly."""
+
+    tokens = list(lexer_json.get_tokens_unprocessed(text + end))
+    assert len(tokens) == len(expected_token_types) + bool(end)
+    assert all(token[1] is expected_token
+               for token, expected_token in zip(tokens, expected_token_types))
+    assert ''.join(token[2] for token in tokens) == text + end
+
+
+@pytest.mark.parametrize(
+    'text, expected',
+    (
+        ('\u0020', Whitespace),  # space
+        ('\u000a', Whitespace),  # newline
+        ('\u000d', Whitespace),  # carriage return
+        ('\u0009', Whitespace),  # tab
+    )
+)
+def test_json_whitespace_positive_match(lexer_json, text, expected):
+    """Validate that whitespace is parsed correctly."""
+
+    tokens = list(lexer_json.get_tokens_unprocessed(text))
+    assert tokens == [(0, expected, text)]
+
+    # Expand the whitespace and verify parsing again.
+    tokens = list(lexer_json.get_tokens_unprocessed(text * 2 + ' '))
+    assert tokens == [(0, expected, text * 2 + ' ')]
+
+
+@pytest.mark.parametrize(
+    'text',
+    (
+        '"', '\\', '/', 'b', 'f', 'n', 'r', 't',
+        'u0123', 'u4567', 'u89ab', 'ucdef', 'uABCD', 'uEF01',
+    )
+)
+def test_json_object_key_escapes_positive_match(lexer_json, text):
+    """Validate that escape sequences in JSON object keys are parsed correctly."""
+
+    tokens = list(lexer_json.get_tokens_unprocessed('{"\\%s": 1}' % text))
+    assert len(tokens) == 6
+    assert tokens[1][1] is Name.Tag
+    assert tokens[1][2] == '"\\%s"' % text
+
+
+@pytest.mark.parametrize('text', ('uz', 'u1z', 'u12z', 'u123z'))
+def test_json_string_unicode_escapes_negative_match(lexer_json, text):
+    """Validate that if unicode escape sequences end abruptly there's no problem."""
+
+    tokens = list(lexer_json.get_tokens_unprocessed('"\\%s"' % text))
+    assert len(tokens) == 1
+    assert tokens[0] == (0, String.Double, '"\\%s"' % text)
+
+
+@pytest.mark.parametrize(
+    'text',
+    (
+        '"', '\\', '/', 'b', 'f', 'n', 'r', 't',
+        'u0123', 'u4567', 'u89ab', 'ucdef', 'uABCD', 'uEF01',
+    )
+)
+def test_json_string_escapes_positive_match(lexer_json, text):
+    """Validate that escape sequences in JSON string values are parsed correctly."""
+
+    text = '"\\%s"' % text
+    tokens = list(lexer_json.get_tokens_unprocessed(text))
+    assert len(tokens) == 1
+    assert tokens[0][1] is String.Double
+    assert tokens[0][2] == text
+
+
+@pytest.mark.parametrize('text', ('+\n', '0\n', '""0\n', 'a\nb\n', '""/-'))
+def test_json_round_trip_errors(lexer_json, text):
+    """Validate that past round-trip errors never crop up again."""
+
+    tokens = list(lexer_json.get_tokens_unprocessed(text))
+    assert ''.join(t[2] for t in tokens) == text
+
+
+def test_json_comments_single_line_positive_matches(lexer_json):
+    """Verify that single-line comments are tokenized correctly."""
+
+    text = '{"a//b"//C1\n:123/////C2\n}\n// // C3'
+    tokens = list(lexer_json.get_tokens_unprocessed(text))
+    assert tokens[2] == (7, Comment.Single, "//C1")
+    assert tokens[6] == (16, Comment.Single, "/////C2")
+    assert tokens[10] == (26, Comment.Single, "// // C3")
+
+    comment_count = sum(1 for token in tokens if token[1] == Comment or token[1].parent == Comment)
+    assert comment_count == 3
+
+    parsed_text = ''.join(token[2] for token in tokens)
+    assert parsed_text == text, 'Input and output texts must match!'
+
+
+def test_json_comments_multiline_positive_matches(lexer_json):
+    """Verify that multiline comments are tokenized correctly."""
+
+    text = '/** / **/{"a /**/ b"/* \n */:123}'
+    tokens = list(lexer_json.get_tokens_unprocessed(text))
+    assert tokens[0] == (0, Comment.Multiline, "/** / **/")
+    assert tokens[3] == (20, Comment.Multiline, "/* \n */")
+
+    comment_count = sum(1 for token in tokens if token[1] == Comment or token[1].parent == Comment)
+    assert comment_count == 2
+
+    parsed_text = ''.join(token[2] for token in tokens)
+    assert parsed_text == text, 'Input and output texts must match!'
+
+
+@pytest.mark.parametrize(
+    "text, expected",
+    (
+        # Unfinished comment openers
+        ('/', (0, Error, '/')),
+        ('1/', (1, Error, '/')),
+        ('/1', (0, Error, '/')),
+        ('""/', (2, Error, '/')),
+        # Unclosed multiline comments
+        ('/*', (0, Error, '/*')),
+        ('/**', (0, Error, '/**')),
+        ('/*/', (0, Error, '/*/')),
+        ('1/*', (1, Error, '/*')),
+        ('""/*', (2, Error, '/*')),
+        ('""/**', (2, Error, '/**')),
+    )
+)
+def test_json_comments_negative_matches(lexer_json, text, expected):
+    """Verify that unfinished or unclosed comments are parsed as errors."""
+
+    tokens = list(lexer_json.get_tokens_unprocessed(text))
+    assert expected in tokens
+
+    parsed_text = ''.join(token[2] for token in tokens)
+    assert parsed_text == text, 'Input and output texts must match!'
+
+
+def test_json_escape_backtracking(lexer_json):
+    """Confirm that there is no catastrophic backtracking in the lexer.
+
+    This no longer applies because the JSON lexer doesn't use regular expressions,
+    but the test is included to ensure no loss of functionality now or in the future.
+    """
+
+    fragment = r'{"\u00D0000\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\63CD'
+    start_time = time.time()
+    list(lexer_json.get_tokens(fragment))
+    assert time.time() - start_time < 1, 'The JSON lexer may have catastrophic backtracking'
+
+
+@pytest.mark.parametrize(
+    'keyword',
+    (
+        'base',
+        'container',
+        'context',
+        'direction',
+        'graph',
+        'id',
+        'import',
+        'included',
+        'index',
+        'json',
+        'language',
+        'list',
+        'nest',
+        'none',
+        'prefix',
+        'propagate',
+        'protected',
+        'reverse',
+        'set',
+        'type',
+        'value',
+        'version',
+        'vocab',
+    )
+)
+def test_json_ld_keywords_positive_match(lexer_json_ld, keyword):
+    """Validate that JSON-LD keywords are parsed correctly."""
+
+    tokens = list(lexer_json_ld.get_tokens_unprocessed('{"@%s": ""}' % keyword))
+    assert len(tokens) == 6
+    assert tokens[1][1] is Token.Name.Decorator
+    assert tokens[1][2] == '"@%s"' % keyword
+
+
+@pytest.mark.parametrize(
+    'keyword',
+    (
+        '@bogus',  # "@" does not guarantee a keyword match
+        '@bases',  # Begins with the keyword "@base"
+        'container',  # Matches "container" but has no leading "@"
+    )
+)
+def test_json_ld_keywords_negative_match(lexer_json_ld, keyword):
+    """Validate that JSON-LD non-keywords are parsed correctly."""
+
+    tokens = list(lexer_json_ld.get_tokens_unprocessed('{"%s": ""}' % keyword))
+    assert len(tokens) == 6
+    assert tokens[1][1] is Token.Name.Tag
+    assert tokens[1][2] == '"%s"' % keyword
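
For readers skimming the diff, a minimal sketch (not part of the commit) of the API these tests exercise: get_tokens_unprocessed() yields (offset, tokentype, value) triples, which is exactly the shape the assertions above compare against. The expected stream below is inferred from the parametrized cases in the tests; exact rendering of token names may vary slightly by Pygments version.

    from pygments.lexers.data import JsonLexer

    # Tokenize a small JSON object and print each (offset, tokentype, value) triple.
    for offset, tokentype, value in JsonLexer().get_tokens_unprocessed('{"a": 1}'):
        print(offset, tokentype, repr(value))

    # Per the object and integer cases above, the output should look roughly like:
    #   0 Token.Punctuation '{'
    #   1 Token.Name.Tag '"a"'
    #   4 Token.Punctuation ':'
    #   5 Token.Text.Whitespace ' '
    #   6 Token.Literal.Number.Integer '1'
    #   7 Token.Punctuation '}'

Note that object keys lex as Name.Tag rather than String.Double, which is why the key-escape tests check tokens[1][1] is Name.Tag while the string-value tests check String.Double.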