summaryrefslogtreecommitdiffstats
path: root/python/samba/tests/source_chars.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/samba/tests/source_chars.py')
-rwxr-xr-xpython/samba/tests/source_chars.py326
1 files changed, 326 insertions, 0 deletions
diff --git a/python/samba/tests/source_chars.py b/python/samba/tests/source_chars.py
new file mode 100755
index 0000000..4613088
--- /dev/null
+++ b/python/samba/tests/source_chars.py
@@ -0,0 +1,326 @@
+#!/usr/bin/env python3
+# Unix SMB/CIFS implementation.
+#
+# Copyright (C) Catalyst.Net Ltd. 2021
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import os
+import sys
+
+sys.path.insert(0, 'bin/python')
+os.environ['PYTHONUNBUFFERED'] = '1'
+
+import subprocess
+from collections import Counter
+from samba.colour import c_RED, c_GREEN, c_DARK_YELLOW, switch_colour_off
+import re
+import unicodedata as u
+from samba.tests import TestCase, SkipTest
+
+if not sys.stdout.isatty():
+ switch_colour_off()
+
+
def _find_root():
    """Return the top-level directory of the git tree containing this file.

    The answer from ``git rev-parse --show-toplevel`` is cross-checked
    against the two directories this file is expected to live under
    (three or four levels above it).

    Raises:
        SkipTest: if git cannot be run, times out, or reports that we
            are not inside a git working tree.

    If git reports an unexpected top-level directory, an error is printed
    and the process exits with status 1.
    """
    try:
        p = subprocess.run(['git', 'rev-parse', '--show-toplevel'],
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE,
                           timeout=10)
    except (subprocess.SubprocessError, OSError) as err:
        # run() without check=True never raises CalledProcessError; the
        # failures that can actually happen here are a missing/unrunnable
        # git binary (OSError) or a timeout (a SubprocessError).
        print(c_RED("Error running git (is this a git tree?): %s" % (err)))
        raise SkipTest(
            "This test is only useful in a git working tree") from err

    if p.returncode != 0:
        raise SkipTest("This test is only useful in a git working tree")

    root = p.stdout.decode().strip()

    here = os.path.dirname(__file__)
    should_be_roots = (
        os.path.abspath(os.path.join(here, "../../..")),
        os.path.abspath(os.path.join(here, "../../../..")),
    )
    if root not in should_be_roots:
        print(c_RED("It looks like we have found the wrong git tree!"))
        sys.exit(1)
    return root
+
+
# Top-level directory of the git tree. Left as None at import time and
# filled in by _find_root() the first time it is needed.
ROOT = None

# Tracked files that are skipped by exact path.
IGNORED_FILES = (
    'source3/selftest/ktest-krb5_ccache-2',
    'source3/selftest/ktest-krb5_ccache-3',
    'testdata/source-chars-bad.c',
)

# Tracked files that are skipped when their path matches (from the
# start) any of these regular expressions.
IGNORED_RE = (
    r'^third_party/heimdal/lib/hcrypto/passwd_dialog',
    r'^third_party/heimdal/lib/hx509/data/',
    r'^third_party/heimdal/po',
    r'^third_party/heimdal/tests/kdc/hdb-mitdb',
    r'^testdata/compression/',
    r'^third_party/heimdal/lib/asn1/fuzz-inputs/',
)

# Tracked files that are skipped because of what follows the last '.'
# in their path.
IGNORED_EXTENSIONS = {
    'SAMBABACKUP',
    'bmp', 'cer', 'corrupt', 'crl', 'crt',
    'dat', 'der', 'dump',
    'gpg', 'gz',
    'ico',
    'keytab',
    'ldb',
    'p12', 'pdf', 'pem', 'png',
    'reg', 'req',
    'sxd',
    'tdb', 'tif',
}


# Format (category Cf) characters that are tolerated. This set is by no
# means exhaustive -- it only holds the format characters we actually
# use.
SAFE_FORMAT_CHARS = {
    '\u200b',
    '\ufeff',
}

# Files that legitimately mix left-to-right and right-to-left text.
# Mixed directions are normal in real-world bilingual documents, but
# they are rare in Samba source code.
BIDI_FILES = {
    'testdata/source-chars-bidi.py',
    'third_party/heimdal/lib/base/test_base.c',
    'third_party/heimdal/lib/wind/NormalizationTest.txt',
}
+
+
def get_git_files():
    """Return the paths of all files tracked by git under ROOT.

    Runs ``git -C ROOT ls-files -z`` and splits its NUL-separated output.

    Returns:
        A list of decoded path strings, or an empty list (after printing
        a diagnostic) if git could not be run, timed out, or exited with
        a non-zero status.
    """
    try:
        p = subprocess.run(['git',
                            '-C', ROOT,
                            'ls-files',
                            '-z'],
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE,
                           timeout=10)
    except (subprocess.SubprocessError, OSError) as e:
        # OSError covers a missing or unrunnable git executable, which
        # is not a SubprocessError.
        print(c_RED(f"Error running git (is this a git tree?): {e}"))
        print("This test is only useful in a git working tree")
        return []

    if p.returncode != 0:
        # Do not try to interpret the output of a failed git run.
        stderr = p.stderr.decode(errors='replace').strip()
        print(c_RED(f"git ls-files failed ({p.returncode}): {stderr}"))
        print("This test is only useful in a git working tree")
        return []

    # With -z every name is NUL-terminated, so the piece after the final
    # NUL is always empty and is dropped.
    filenames = p.stdout.split(b'\x00')
    return [x.decode() for x in filenames[:-1]]
+
+
def iter_source_files():
    """Yield the git-tracked file names that should be checked.

    A name is skipped, with a note printed, when it is listed in
    IGNORED_FILES, when it matches one of the IGNORED_RE patterns, or
    when the text after its last '.' is in IGNORED_EXTENSIONS.
    """
    for name in get_git_files():
        if name in IGNORED_FILES:
            print(c_DARK_YELLOW(f"ignoring (exact) {name}"))
            continue

        if any(re.match(pattern, name) for pattern in IGNORED_RE):
            print(c_DARK_YELLOW(f"ignoring (via RE) {name}"))
            continue

        # rpartition() gives an empty separator when there is no '.'.
        _, dot, suffix = name.rpartition('.')
        if dot and suffix in IGNORED_EXTENSIONS:
            print(c_DARK_YELLOW(f"ignoring {name}"))
            continue

        yield name
+
+
def is_latin1_file(name):
    """Return True if *name* is one of the files allowed to be latin-1.

    *name* is matched from its start against a short list of path
    patterns. The dots in file names are escaped so that they match only
    a literal '.', rather than any character.
    """
    patterns = (
        r'^source4/setup/ad-schema/\w+\.ldf$',
        r'^source4/setup/display-specifiers/D[\w-]+\.txt$',
        r'^third_party/heimdal/cf/pkg\.m4$',
        r'^third_party/heimdal/doc/standardisation/',
    )
    return any(re.match(pattern, name) for pattern in patterns)
+
+
def is_bad_latin1_file(fullname):
    """Return True if the file has two or more adjacent non-ASCII bytes.

    In practice the few latin-1 files we have contain isolated
    non-ASCII bytes surrounded by ASCII, whereas the utf-8 sequences we
    are concerned about involve runs of 3 high bytes. A file is
    therefore treated as safe latin-1 only when every high byte stands
    alone.
    """
    with open(fullname, 'rb') as f:
        data = f.read()
    # Compare each byte with its successor; a pair of high bytes is a run.
    return any(first > 0x7f and second > 0x7f
               for first, second in zip(data, data[1:]))
+
+
def is_bad_char(c):
    """Return True if *c* is a format character we do not tolerate.

    That is, its Unicode general category is 'Cf' and it is not listed
    in SAFE_FORMAT_CHARS.
    """
    return u.category(c) == 'Cf' and c not in SAFE_FORMAT_CHARS
+
+
class CharacterTests(TestCase):
    """Checks on the characters used in git-tracked source files."""

    def setUp(self):
        # The git root is looked up once and cached in the module global.
        global ROOT
        if not ROOT:
            ROOT = _find_root()

    def test_no_unexpected_format_chars(self):
        """This test tries to ensure that no source file has unicode control
        characters that can change the apparent order of other
        characters. These characters could make code appear to have
        a different semantic meaning than it really does.

        This issue is sometimes called "Trojan Source", "CVE-2021-42574",
        or "CVE-2021-42694".
        """
        for name in iter_source_files():
            fullname = os.path.join(ROOT, name)
            try:
                # Decode explicitly as utf-8 so that the result does not
                # depend on the locale the test happens to run under.
                with open(fullname, encoding='utf-8') as f:
                    s = f.read()
            except UnicodeDecodeError as e:
                # probably a latin-1 encoding, which we tolerate in a few
                # files for historical reasons, though we check that there
                # are not long sequences of high bytes.
                if not is_latin1_file(name):
                    self.fail(f"could not decode {name}: {e}")
                if is_bad_latin1_file(fullname):
                    self.fail(f"latin-1 file {name} has long sequences "
                              "of high bytes")
                # Nothing was decoded for this file, so there is no text
                # to inspect; without this, `s` would be stale from the
                # previous file or unbound on the first one.
                continue

            dirs = set()
            for c in set(s):
                if is_bad_char(c):
                    self.fail(f"{name} has potentially bad format characters!")
                dirs.add(u.bidirectional(c))

            if 'L' in dirs and 'R' in dirs:
                if name not in BIDI_FILES:
                    self.fail(f"{name} has LTR and RTL text ({dirs})")

    def test_unexpected_format_chars_do_fail(self):
        """Test the test: the known-bad sample file must contain the
        expected number of distinct bad format characters."""
        for name, n_bad in [
                ('testdata/source-chars-bad.c', 3)
        ]:
            fullname = os.path.join(ROOT, name)
            with open(fullname, encoding='utf-8') as f:
                s = f.read()
            chars = set(s)
            bad_chars = [c for c in chars if is_bad_char(c)]
            self.assertEqual(len(bad_chars), n_bad)

    def test_unexpected_bidi_fails(self):
        """Test the test: the bidi sample file must contain both
        left-to-right and right-to-left characters."""
        for name in [
                'testdata/source-chars-bidi.py'
        ]:
            fullname = os.path.join(ROOT, name)
            with open(fullname, encoding='utf-8') as f:
                s = f.read()

            dirs = set()
            for c in set(s):
                dirs.add(u.bidirectional(c))
            self.assertIn('L', dirs)
            self.assertIn('R', dirs)
+
+
def check_file_text():
    """If called directly as a script, count the found characters.

    Every file yielded by iter_source_files() is decoded as utf-8 and
    its characters are tallied. Files that fail to decode are reported
    and then skipped. Each format (Cf) character found is reported per
    file, and finally the distinct control, format and other 'C*'
    category characters seen are printed.
    """
    global ROOT
    if not ROOT:
        ROOT = _find_root()

    counts = Counter()
    for name in iter_source_files():
        fullname = os.path.join(ROOT, name)
        try:
            # Decode explicitly as utf-8 so that the result does not
            # depend on the locale the script happens to run under.
            with open(fullname, encoding='utf-8') as f:
                s = f.read()
        except UnicodeDecodeError as e:
            if is_latin1_file(name):
                if is_bad_latin1_file(fullname):
                    print(c_RED(f"latin-1 file {name} has long sequences "
                                "of high bytes"))
                else:
                    print(c_GREEN(f"latin-1 file {name} is fine"))
            else:
                print(c_RED(f"can't read {name}: {e}"))
            # Nothing was decoded for this file; without this, `s` would
            # be stale from the previous file or unbound on the first one.
            continue

        counts.update(s)
        chars = set(s)
        for c in chars:
            if u.category(c) == 'Cf':
                print(c_GREEN(f"{name} has {u.name(c)}"))

    print(len(counts))
    controls = []
    formats = []
    others = []
    for x in counts:
        c = u.category(x)
        if c == 'Cc':
            controls.append(x)
        elif c == 'Cf':
            formats.append(x)
        elif c[0] == 'C':
            others.append(x)

    print(f"normal control characters {controls}")
    print(f"format characters {formats}")
    print(f"other control characters {others}")
+
+
+if __name__ == '__main__':
+ check_file_text()