diff options
Diffstat (limited to 'python/samba/tests/source_chars.py')
-rwxr-xr-x | python/samba/tests/source_chars.py | 326 |
1 files changed, 326 insertions, 0 deletions
#!/usr/bin/env python3
# Unix SMB/CIFS implementation.
#
# Copyright (C) Catalyst.Net Ltd. 2021
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""Check git-tracked files for Unicode format characters and mixed-direction
text ("Trojan Source", CVE-2021-42574 / CVE-2021-42694).

Run as a test via CharacterTests, or directly as a script to print a
summary of the control and format characters found.
"""

import os
import sys

# These must happen before the samba imports below.
sys.path.insert(0, 'bin/python')
os.environ['PYTHONUNBUFFERED'] = '1'

import subprocess
from collections import Counter
from samba.colour import c_RED, c_GREEN, c_DARK_YELLOW, switch_colour_off
import re
import unicodedata as u
from samba.tests import TestCase, SkipTest

if not sys.stdout.isatty():
    switch_colour_off()


def _find_root():
    """Return the top-level directory of the git tree containing this file.

    Raises:
        SkipTest: if git cannot be run (missing binary, timeout, ...) or
            reports that we are not inside a git working tree.

    Exits the process with status 1 if git reports a top-level directory
    that is not one of the expected ancestors of this file.
    """
    try:
        p = subprocess.run(['git', 'rev-parse', '--show-toplevel'],
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE,
                           timeout=10)
    except (OSError, subprocess.SubprocessError) as err:
        # subprocess.run() without check=True never raises
        # CalledProcessError; what we can actually see here is a missing
        # git binary (OSError) or a timeout (SubprocessError).
        print(c_RED("Error running git (is this a git tree?): %s" % (err)))
        raise SkipTest("This test is only useful in a git working tree")

    if p.returncode != 0:
        raise SkipTest("This test is only useful in a git working tree")

    root = p.stdout.decode().strip()

    # The tree root is expected to be three or four directories above
    # this file (presumably the source tree and a copy one level
    # deeper, such as under bin/ -- TODO confirm).
    here = os.path.dirname(__file__)
    should_be_roots = (
        os.path.abspath(os.path.join(here, "../../..")),
        os.path.abspath(os.path.join(here, "../../../..")),
    )
    if root not in should_be_roots:
        print(c_RED("It looks like we have found the wrong git tree!"))
        sys.exit(1)
    return root


# Set lazily by CharacterTests.setUp() or check_file_text().
ROOT = None

IGNORED_FILES = (
    'source3/selftest/ktest-krb5_ccache-2',
    'source3/selftest/ktest-krb5_ccache-3',
    'testdata/source-chars-bad.c',
)

IGNORED_RE = (
    r'^third_party/heimdal/lib/hcrypto/passwd_dialog',
    r'^third_party/heimdal/lib/hx509/data/',
    r'^third_party/heimdal/po',
    r'^third_party/heimdal/tests/kdc/hdb-mitdb',
    r'^testdata/compression/',
    r'^third_party/heimdal/lib/asn1/fuzz-inputs/',
)

IGNORED_EXTENSIONS = {
    'bmp',
    'cer',
    'corrupt',
    'crl',
    'crt',
    'dat',
    'der',
    'dump',
    'gpg',
    'gz',
    'ico',
    'keytab',
    'ldb',
    'p12',
    'pdf',
    'pem',
    'png',
    'SAMBABACKUP',
    'sxd',
    'tdb',
    'tif',
    'reg',
    'req'
}


# This list is by no means exhaustive -- these are just the format
# characters we actually use.
SAFE_FORMAT_CHARS = {
    '\u200b',
    '\ufeff'
}

# These files legitimately mix left-to-right and right-to-left text.
# In the real world mixing directions would be normal in bilingual
# documents, but it is rare in Samba source code.
BIDI_FILES = {
    'third_party/heimdal/lib/base/test_base.c',
    'third_party/heimdal/lib/wind/NormalizationTest.txt',
    'testdata/source-chars-bidi.py',
}


def get_git_files():
    """Return the names of all files tracked by git under ROOT.

    Names are relative to ROOT. If git cannot be run or exits with an
    error, a message is printed and an empty list is returned.
    """
    try:
        p = subprocess.run(['git',
                            '-C', ROOT,
                            'ls-files',
                            '-z'],
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE,
                           timeout=10)
    except (OSError, subprocess.SubprocessError) as e:
        print(c_RED(f"Error running git (is this a git tree?): {e}"))
        print("This test is only useful in a git working tree")
        return []

    if p.returncode != 0:
        err = p.stderr.decode(errors='replace').strip()
        print(c_RED(f"Error running git (is this a git tree?): {err}"))
        print("This test is only useful in a git working tree")
        return []

    # With -z every name is NUL-terminated, so the final split element
    # is always empty and is dropped.
    filenames = p.stdout.split(b'\x00')
    return [x.decode() for x in filenames[:-1]]


def iter_source_files():
    """Yield each git-tracked file name that should be checked.

    Files are skipped (with a printed note) if they are listed in
    IGNORED_FILES, match a pattern in IGNORED_RE, or have an extension
    in IGNORED_EXTENSIONS.
    """
    for name in get_git_files():
        if name in IGNORED_FILES:
            print(c_DARK_YELLOW(f"ignoring (exact) {name}"))
            continue

        if any(re.match(pattern, name) for pattern in IGNORED_RE):
            print(c_DARK_YELLOW(f"ignoring (via RE) {name}"))
            continue

        if '.' in name:
            ext = name.rsplit('.', 1)[1]
            if ext in IGNORED_EXTENSIONS:
                print(c_DARK_YELLOW(f"ignoring {name}"))
                continue

        yield name


def is_latin1_file(name):
    """Return True if name is one of the files we tolerate as latin-1."""
    # The dot before each extension is escaped so that it only matches
    # a literal '.'.
    patterns = (
        r'^source4/setup/ad-schema/\w+\.ldf$',
        r'^source4/setup/display-specifiers/D[\w-]+\.txt$',
        r'^third_party/heimdal/cf/pkg\.m4$',
        r'^third_party/heimdal/doc/standardisation/',
    )
    return any(re.match(pattern, name) for pattern in patterns)


def is_bad_latin1_file(fullname):
    """Return True if the file has two or more adjacent non-ASCII bytes."""
    # In practice, the few latin-1 files we have have single non-ASCII
    # byte islands in a sea of ASCII. The utf-8 sequences we are
    # concerned about involve sequences of 3 high bytes. We can say a
    # file is safe latin-1 if it has only individual high bytes.
    with open(fullname, 'rb') as f:
        b = f.read()
    in_seq = False
    for c in b:
        if c > 0x7f:
            if in_seq:
                return True
            in_seq = True
        else:
            in_seq = False
    return False


def is_bad_char(c):
    """Return True if c is a format character (category Cf) that is not
    in SAFE_FORMAT_CHARS."""
    if u.category(c) != 'Cf':
        return False
    return c not in SAFE_FORMAT_CHARS


class CharacterTests(TestCase):
    """Tests that tracked source files contain no surprising characters."""

    def setUp(self):
        # Find the git root once and share it between tests.
        global ROOT
        if not ROOT:
            ROOT = _find_root()

    def test_no_unexpected_format_chars(self):
        """This test tries to ensure that no source file has unicode control
        characters that can change the apparent order of other
        characters. These characters could make code appear to have
        different semantic meaning than it really does.

        This issue is sometimes called "Trojan Source", "CVE-2021-42574",
        or "CVE-2021-42694".
        """
        for name in iter_source_files():
            fullname = os.path.join(ROOT, name)
            try:
                # Decode as utf-8 explicitly so the result does not
                # depend on the locale the test happens to run under.
                with open(fullname, encoding='utf-8') as f:
                    s = f.read()
            except UnicodeDecodeError as e:
                # probably a latin-1 encoding, which we tolerate in a few
                # files for historical reasons, though we check that there
                # are not long sequences of high bytes.
                if is_latin1_file(name):
                    if is_bad_latin1_file(fullname):
                        self.fail(f"latin-1 file {name} has long sequences "
                                  "of high bytes")
                else:
                    self.fail(f"could not decode {name}: {e}")
                # An acceptable latin-1 file: there is no decoded text
                # to examine, so move on rather than reusing the
                # previous file's contents.
                continue

            dirs = set()
            for c in set(s):
                if is_bad_char(c):
                    self.fail(f"{name} has potentially bad format characters!")
                dirs.add(u.bidirectional(c))

            if 'L' in dirs and 'R' in dirs:
                if name not in BIDI_FILES:
                    self.fail(f"{name} has LTR and RTL text ({dirs})")

    def test_unexpected_format_chars_do_fail(self):
        """Test the test"""
        for name, n_bad in [
                ('testdata/source-chars-bad.c', 3)
        ]:
            fullname = os.path.join(ROOT, name)
            with open(fullname, encoding='utf-8') as f:
                s = f.read()
            chars = set(s)
            bad_chars = [c for c in chars if is_bad_char(c)]
            self.assertEqual(len(bad_chars), n_bad)

    def test_unexpected_bidi_fails(self):
        """Test the test"""
        for name in [
                'testdata/source-chars-bidi.py'
        ]:
            fullname = os.path.join(ROOT, name)
            with open(fullname, encoding='utf-8') as f:
                s = f.read()

            dirs = set()
            for c in set(s):
                dirs.add(u.bidirectional(c))
            self.assertIn('L', dirs)
            self.assertIn('R', dirs)


def check_file_text():
    """If called directly as a script, count the found characters."""
    global ROOT
    if not ROOT:
        try:
            ROOT = _find_root()
        except SkipTest as e:
            # As a script there is no test runner to catch this, so
            # report the reason instead of dying with a traceback.
            print(e)
            return

    counts = Counter()
    for name in iter_source_files():
        fullname = os.path.join(ROOT, name)
        try:
            with open(fullname, encoding='utf-8') as f:
                s = f.read()
        except UnicodeDecodeError as e:
            if is_latin1_file(name):
                if is_bad_latin1_file(fullname):
                    print(c_RED(f"latin-1 file {name} has long sequences "
                                "of high bytes"))
                else:
                    print(c_GREEN(f"latin-1 file {name} is fine"))
            else:
                print(c_RED(f"can't read {name}: {e}"))
            # Nothing was decoded for this file; do not count the
            # previous file's characters a second time.
            continue

        counts.update(s)
        chars = set(s)
        for c in chars:
            if u.category(c) == 'Cf':
                print(c_GREEN(f"{name} has {u.name(c)}"))

    print(len(counts))
    controls = []
    formats = []
    others = []
    for x in counts:
        c = u.category(x)
        if c == 'Cc':
            controls.append(x)
        elif c == 'Cf':
            formats.append(x)
        elif c[0] == 'C':
            others.append(x)

    print(f"normal control characters {controls}")
    print(f"format characters {formats}")
    print(f"other control characters {others}")


if __name__ == '__main__':
    check_file_text()