-rw-r--r--  tests/test_data.py  285
1 file changed, 285 insertions, 0 deletions
diff --git a/tests/test_data.py b/tests/test_data.py
new file mode 100644
index 0000000..719f8bf
--- /dev/null
+++ b/tests/test_data.py
@@ -0,0 +1,285 @@
+"""
+ Data Tests
+ ~~~~~~~~~~
+
+ :copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS.
+ :license: BSD, see LICENSE for details.
+"""
+
+import time
+
+import pytest
+
+from pygments.lexers.data import JsonLexer, JsonBareObjectLexer, JsonLdLexer
+from pygments.token import Comment, Error, Token, Punctuation, Number, String, \
+ Keyword, Name, Whitespace
+
+
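+# Module-scoped fixtures: each lexer is instantiated once and safely reused across tests.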
+@pytest.fixture(scope='module')
+def lexer_json():
+ yield JsonLexer()
+
+
+@pytest.fixture(scope='module')
+def lexer_bare():
+ yield JsonBareObjectLexer()
+
+
+@pytest.fixture(scope='module')
+def lexer_json_ld():
+ yield JsonLdLexer()
+
+
+@pytest.mark.parametrize(
+ 'text, expected_token_types',
+ (
+ # Integers
+ ('0', (Number.Integer,)),
+ ('-1', (Number.Integer,)),
+ ('1234567890', (Number.Integer,)),
+ ('-1234567890', (Number.Integer,)),
+
+ # Floats, including scientific notation
+ ('123456789.0123456789', (Number.Float,)),
+ ('-123456789.0123456789', (Number.Float,)),
+ ('1e10', (Number.Float,)),
+ ('-1E10', (Number.Float,)),
+ ('1e-10', (Number.Float,)),
+ ('-1E+10', (Number.Float,)),
+ ('1.0e10', (Number.Float,)),
+ ('-1.0E10', (Number.Float,)),
+ ('1.0e-10', (Number.Float,)),
+ ('-1.0E+10', (Number.Float,)),
+
+ # Strings (escapes are tested elsewhere)
+ ('""', (String.Double,)),
+ ('"abc"', (String.Double,)),
+ ('"ひらがな"', (String.Double,)),
+ ('"123"', (String.Double,)),
+ ('"[]"', (String.Double,)),
+ ('"{}"', (String.Double,)),
+ ('"true"', (String.Double,)),
+ ('"false"', (String.Double,)),
+ ('"null"', (String.Double,)),
+ ('":,"', (String.Double,)),
+
+ # Constants
+        ('true', (Keyword.Constant,)),
+        ('false', (Keyword.Constant,)),
+        ('null', (Keyword.Constant,)),
+
+ # Arrays
+ ('[]', (Punctuation,)),
+ ('["a", "b"]', (Punctuation, String.Double, Punctuation,
+ Whitespace, String.Double, Punctuation)),
+
+ # Objects
+ ('{}', (Punctuation,)),
+ ('{"a": "b"}', (Punctuation, Name.Tag, Punctuation,
+ Whitespace, String.Double, Punctuation)),
+ )
+)
+@pytest.mark.parametrize('end', ('', '\n'))
+def test_json_literals_positive_match(lexer_json, text, expected_token_types, end):
+ """Validate that syntactically-correct JSON literals are parsed correctly."""
+
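+    # get_tokens_unprocessed() yields (index, token_type, value) tuples.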
+ tokens = list(lexer_json.get_tokens_unprocessed(text + end))
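+    # A trailing newline, when present, adds exactly one Whitespace token.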
+ assert len(tokens) == len(expected_token_types) + bool(end)
+ assert all(token[1] is expected_token
+ for token, expected_token in zip(tokens, expected_token_types))
+ assert ''.join(token[2] for token in tokens) == text + end
+
+
+@pytest.mark.parametrize(
+ 'text, expected',
+ (
+ ('\u0020', Whitespace), # space
+ ('\u000a', Whitespace), # newline
+ ('\u000d', Whitespace), # carriage return
+ ('\u0009', Whitespace), # tab
+ )
+)
+def test_json_whitespace_positive_match(lexer_json, text, expected):
+ """Validate that whitespace is parsed correctly."""
+
+ tokens = list(lexer_json.get_tokens_unprocessed(text))
+ assert tokens == [(0, expected, text)]
+
+ # Expand the whitespace and verify parsing again.
+ tokens = list(lexer_json.get_tokens_unprocessed(text * 2 + ' '))
+ assert tokens == [(0, expected, text * 2 + ' ')]
+
+
+@pytest.mark.parametrize(
+ 'text',
+ (
+ '"', '\\', '/', 'b', 'f', 'n', 'r', 't',
+ 'u0123', 'u4567', 'u89ab', 'ucdef', 'uABCD', 'uEF01',
+ )
+)
+def test_json_object_key_escapes_positive_match(lexer_json, text):
+ """Validate that escape sequences in JSON object keys are parsed correctly."""
+
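+    # Expected tokens: '{', the escaped key, ':', ' ', '1', '}'.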
+ tokens = list(lexer_json.get_tokens_unprocessed('{"\\%s": 1}' % text))
+ assert len(tokens) == 6
+ assert tokens[1][1] is Name.Tag
+ assert tokens[1][2] == '"\\%s"' % text
+
+
+@pytest.mark.parametrize('text', ('uz', 'u1z', 'u12z', 'u123z'))
+def test_json_string_unicode_escapes_negative_match(lexer_json, text):
+ """Validate that if unicode escape sequences end abruptly there's no problem."""
+
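+    # A truncated escape must remain inside a single String.Double token.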
+ tokens = list(lexer_json.get_tokens_unprocessed('"\\%s"' % text))
+ assert len(tokens) == 1
+ assert tokens[0] == (0, String.Double, '"\\%s"' % text)
+
+
+@pytest.mark.parametrize(
+ 'text',
+ (
+ '"', '\\', '/', 'b', 'f', 'n', 'r', 't',
+ 'u0123', 'u4567', 'u89ab', 'ucdef', 'uABCD', 'uEF01',
+ )
+)
+def test_json_string_escapes_positive_match(lexer_json, text):
+ """Validate that escape sequences in JSON string values are parsed correctly."""
+
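+    # The whole escaped string should lex as exactly one String.Double token.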
+ text = '"\\%s"' % text
+ tokens = list(lexer_json.get_tokens_unprocessed(text))
+ assert len(tokens) == 1
+ assert tokens[0][1] is String.Double
+ assert tokens[0][2] == text
+
+
+@pytest.mark.parametrize('text', ('+\n', '0\n', '""0\n', 'a\nb\n', '""/-'))
+def test_json_round_trip_errors(lexer_json, text):
+ """Validate that past round-trip errors never crop up again."""
+
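+    # Each input is a past regression; joining the token values must reproduce it exactly.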
+ tokens = list(lexer_json.get_tokens_unprocessed(text))
+ assert ''.join(t[2] for t in tokens) == text
+
+
+def test_json_comments_single_line_positive_matches(lexer_json):
+ """Verify that single-line comments are tokenized correctly."""
+
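+    # "//" inside the quoted key must not be mistaken for a comment opener.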
+ text = '{"a//b"//C1\n:123/////C2\n}\n// // C3'
+ tokens = list(lexer_json.get_tokens_unprocessed(text))
+ assert tokens[2] == (7, Comment.Single, "//C1")
+ assert tokens[6] == (16, Comment.Single, "/////C2")
+ assert tokens[10] == (26, Comment.Single, "// // C3")
+
+    comment_count = sum(1 for token in tokens
+                        if token[1] == Comment or token[1].parent == Comment)
+ assert comment_count == 3
+
+ parsed_text = ''.join(token[2] for token in tokens)
+ assert parsed_text == text, 'Input and output texts must match!'
+
+
+def test_json_comments_multiline_positive_matches(lexer_json):
+ """Verify that multiline comments are tokenized correctly."""
+
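+    # "/**/" inside the quoted key must not open or close a comment.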
+ text = '/** / **/{"a /**/ b"/* \n */:123}'
+ tokens = list(lexer_json.get_tokens_unprocessed(text))
+ assert tokens[0] == (0, Comment.Multiline, "/** / **/")
+ assert tokens[3] == (20, Comment.Multiline, "/* \n */")
+
+    comment_count = sum(1 for token in tokens
+                        if token[1] == Comment or token[1].parent == Comment)
+ assert comment_count == 2
+
+ parsed_text = ''.join(token[2] for token in tokens)
+ assert parsed_text == text, 'Input and output texts must match!'
+
+
+@pytest.mark.parametrize(
+ "text, expected",
+ (
+ # Unfinished comment openers
+ ('/', (0, Error, '/')),
+ ('1/', (1, Error, '/')),
+ ('/1', (0, Error, '/')),
+ ('""/', (2, Error, '/')),
+ # Unclosed multiline comments
+ ('/*', (0, Error, '/*')),
+ ('/**', (0, Error, '/**')),
+ ('/*/', (0, Error, '/*/')),
+ ('1/*', (1, Error, '/*')),
+ ('""/*', (2, Error, '/*')),
+ ('""/**', (2, Error, '/**')),
+ )
+)
+def test_json_comments_negative_matches(lexer_json, text, expected):
+ """Verify that the unfinished or unclosed comments are parsed as errors."""
+
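+    # Only the Error token's presence is asserted; surrounding tokens may vary.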
+ tokens = list(lexer_json.get_tokens_unprocessed(text))
+ assert expected in tokens
+
+ parsed_text = ''.join(token[2] for token in tokens)
+ assert parsed_text == text, 'Input and output texts must match!'
+
+
+def test_json_escape_backtracking(lexer_json):
+ """Confirm that there is no catastrophic backtracking in the lexer.
+
+    The JSON lexer no longer uses regular expressions, so this cannot occur,
+    but the test remains as a regression guard for future implementations.
+ """
+
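+    # A long run of backslashes after an escape is a classic catastrophic-backtracking trigger.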
+ fragment = r'{"\u00D0000\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\63CD'
+ start_time = time.time()
+ list(lexer_json.get_tokens(fragment))
+ assert time.time() - start_time < 1, 'The JSON lexer may have catastrophic backtracking'
+
+
+@pytest.mark.parametrize(
+ 'keyword',
+ (
+ 'base',
+ 'container',
+ 'context',
+ 'direction',
+ 'graph',
+ 'id',
+ 'import',
+ 'included',
+ 'index',
+ 'json',
+ 'language',
+ 'list',
+ 'nest',
+ 'none',
+ 'prefix',
+ 'propagate',
+ 'protected',
+ 'reverse',
+ 'set',
+ 'type',
+ 'value',
+ 'version',
+ 'vocab',
+ )
+)
+def test_json_ld_keywords_positive_match(lexer_json_ld, keyword):
+ """Validate that JSON-LD keywords are parsed correctly."""
+
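+    # JSON-LD keywords such as "@id" should lex as Name.Decorator, not the generic Name.Tag.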
+ tokens = list(lexer_json_ld.get_tokens_unprocessed('{"@%s": ""}' % keyword))
+ assert len(tokens) == 6
+ assert tokens[1][1] is Token.Name.Decorator
+ assert tokens[1][2] == '"@%s"' % keyword
+
+
+@pytest.mark.parametrize(
+ 'keyword',
+ (
+ '@bogus', # "@" does not guarantee a keyword match
+ '@bases', # Begins with the keyword "@base"
+ 'container', # Matches "container" but has no leading "@"
+ )
+)
+def test_json_ld_keywords_negative_match(lexer_json_ld, keyword):
+ """Validate that JSON-LD non-keywords are parsed correctly."""
+
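+    # Near-miss keywords must fall back to the ordinary Name.Tag object-key token.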
+ tokens = list(lexer_json_ld.get_tokens_unprocessed('{"%s": ""}' % keyword))
+ assert len(tokens) == 6
+ assert tokens[1][1] is Token.Name.Tag
+ assert tokens[1][2] == '"%s"' % keyword