diff options
Diffstat (limited to 'third_party/python/binaryornot')
9 files changed, 512 insertions, 0 deletions
diff --git a/third_party/python/binaryornot/binaryornot-0.4.4.dist-info/DESCRIPTION.rst b/third_party/python/binaryornot/binaryornot-0.4.4.dist-info/DESCRIPTION.rst new file mode 100644 index 0000000000..4ef0073431 --- /dev/null +++ b/third_party/python/binaryornot/binaryornot-0.4.4.dist-info/DESCRIPTION.rst @@ -0,0 +1,152 @@ +============================= +BinaryOrNot +============================= + +.. image:: https://img.shields.io/pypi/v/binaryornot.svg?style=flat + :target: https://pypi.python.org/pypi/binaryornot + +.. image:: https://readthedocs.org/projects/binaryornot/badge/?version=latest + :target: http://binaryornot.readthedocs.io/en/latest/?badge=latest + :alt: Documentation Status + + +.. image:: https://pyup.io/repos/github/audreyr/binaryornot/shield.svg + :target: https://pyup.io/repos/github/audreyr/binaryornot/ + :alt: Updates + +Ultra-lightweight pure Python package to guess whether a file is binary or text, +using a heuristic similar to Perl's `pp_fttext` and its analysis by @eliben. + +* Free software: BSD license +* Documentation: https://binaryornot.readthedocs.io + +Status +------ + +It works, and people are using this package in various places. But it doesn't cover all edge cases yet. + +The code could be improved. Pull requests welcome! As of now, it is based on these snippets, but that may change: + +* http://stackoverflow.com/questions/898669/how-can-i-detect-if-a-file-is-binary-non-text-in-python +* http://stackoverflow.com/questions/1446549/how-to-identify-binary-and-text-files-using-python +* http://code.activestate.com/recipes/173220/ +* http://eli.thegreenplace.net/2011/10/19/perls-guess-if-file-is-text-or-binary-implemented-in-python/ + +Features +-------- + +Has tests for these file types: + +* Text: .txt, .css, .json, .svg, .js, .lua, .pl, .rst +* Binary: .png, .gif, .jpg, .tiff, .bmp, .DS_Store, .eot, .otf, .ttf, .woff, .rgb + +Has tests for numerous encodings. + +Why? 
+---- + +You may be thinking, "I can write this in 2 lines of code?!" + +It's actually not that easy. Here's a great article about how Perl's +heuristic to guess file types works: http://eli.thegreenplace.net/2011/10/19/perls-guess-if-file-is-text-or-binary-implemented-in-python/ + +And that's just where we started. Over time, we've found more edge cases and +our heuristic has gotten more complex. + +Also, this package saves you from having to write and thoroughly test +your code with all sorts of weird file types and encodings, cross-platform. + +Builds +------ + +Linux (Ubuntu 12.04 LTS Server Edition 64 bit): + +.. image:: https://img.shields.io/travis/audreyr/binaryornot/master.svg + :target: https://travis-ci.org/audreyr/binaryornot + +Windows (Windows Server 2012 R2 (x64)): + +.. image:: https://img.shields.io/appveyor/ci/audreyr/binaryornot/master.svg + :target: https://ci.appveyor.com/project/audreyr/binaryornot + +Credits +------- + +* Special thanks to Eli Bendersky (@eliben) for his writeup explaining the heuristic and his implementation, which this is largely based on. +* Source code from the portion of Perl's `pp_fttext` that checks for textiness: https://github.com/Perl/perl5/blob/v5.23.1/pp_sys.c#L3527-L3587 + + + + +History +------- + +0.4.4 (2017-04-13) +~~~~~~~~~~~~~~~~~~ + +* Notify users for file i/o issues. Thanks @lukehinds! + + +0.4.3 (2017-04-13) +~~~~~~~~~~~~~~~~~~ + +* Restricted chardet to anything 3.0.2 or higher due to https://github.com/chardet/chardet/issues/113. Thanks @dan-blanchard for the quick fix! + +0.4.2 (2017-04-12) +~~~~~~~~~~~~~~~~~~ + +* Restricted chardet to anything under 3.0 due to https://github.com/chardet/chardet/issues/113 +* Added pyup badge +* Added utilities for pushing new versions up + +0.4.0 (2015-08-21) +~~~~~~~~~~~~~~~~~~ + +* Enhanced detection for some binary streams and UTF texts. (#10, 11) Thanks `@pombredanne`_. +* Set up Appveyor for continuous testing on Windows. Thanks `@pydanny`_. 
+* Update link to Perl source implementation. (#9) Thanks `@asmeurer`_ `@pombredanne`_ `@audreyr`_. +* Handle UnicodeDecodeError in check. (#12) Thanks `@DRMacIver`_. +* Add very simple Hypothesis based tests. (#13) Thanks `@DRMacIver`_. +* Use setup to determine requirements and remove redundant requirements.txt. (#14) Thanks `@hackebrot`_. +* Add documentation status badge to README.rst. (#15) Thanks `@hackebrot`_. +* Run tox in travis.yml. Add pypy and Python 3.4 to tox environments. (#16) Thanks `@hackebrot`_ `@pydanny`_. +* Handle LookupError when detecting encoding. (#17) Thanks `@DRMacIver`_. + + +.. _`@pombredanne`: https://github.com/pombredanne +.. _`@pydanny`: https://github.com/pydanny +.. _`@asmeurer`: https://github.com/asmeurer +.. _`@audreyr`: https://github.com/audreyr +.. _`@DRMacIver`: https://github.com/DRMacIver +.. _`@hackebrot`: https://github.com/hackebrot + +0.3.0 (2014-05-05) +~~~~~~~~~~~~~~~~~~ + +* Include tests, docs in source package. (#6) Thanks `@vincentbernat`_. +* Drop unnecessary shebangs and executable bits. (#8) Thanks `@scop`_. +* Generate string of printable extended ASCII bytes only once. (#7) Thanks `@scop`_. +* Make number of bytes to read parametrizable. (#7) Thanks `@scop`_. + +.. _`@vincentbernat`: https://github.com/vincentbernat +.. _`@scop`: https://github.com/scop + +0.2.0 (2013-09-22) +~~~~~~~~~~~~~~~~~~ + +* Complete rewrite of everything. Thanks `@ncoghlan`_. + +.. _`@ncoghlan`: https://github.com/ncoghlan + +0.1.1 (2013-08-17) +~~~~~~~~~~~~~~~~~~ + +* Tests pass under Python 2.6, 2.7, 3.3, PyPy. + + +0.1.0 (2013-08-17) +~~~~~~~~~~~~~~~~~~ + +* First release on PyPI. 
+ + diff --git a/third_party/python/binaryornot/binaryornot-0.4.4.dist-info/METADATA b/third_party/python/binaryornot/binaryornot-0.4.4.dist-info/METADATA new file mode 100644 index 0000000000..cabfe74587 --- /dev/null +++ b/third_party/python/binaryornot/binaryornot-0.4.4.dist-info/METADATA @@ -0,0 +1,175 @@ +Metadata-Version: 2.0 +Name: binaryornot +Version: 0.4.4 +Summary: Ultra-lightweight pure Python package to check if a file is binary or text. +Home-page: https://github.com/audreyr/binaryornot +Author: Audrey Roy Greenfeld +Author-email: aroy@alum.mit.edu +License: BSD +Keywords: binaryornot +Platform: UNKNOWN +Classifier: Development Status :: 5 - Production/Stable +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: BSD License +Classifier: Natural Language :: English +Classifier: Programming Language :: Python :: 2 +Classifier: Programming Language :: Python :: 2.7 +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3.3 +Classifier: Programming Language :: Python :: 3.4 +Classifier: Programming Language :: Python :: 3.5 +Classifier: Programming Language :: Python :: 3.6 +Requires-Dist: chardet (>=3.0.2) + +============================= +BinaryOrNot +============================= + +.. image:: https://img.shields.io/pypi/v/binaryornot.svg?style=flat + :target: https://pypi.python.org/pypi/binaryornot + +.. image:: https://readthedocs.org/projects/binaryornot/badge/?version=latest + :target: http://binaryornot.readthedocs.io/en/latest/?badge=latest + :alt: Documentation Status + + +.. image:: https://pyup.io/repos/github/audreyr/binaryornot/shield.svg + :target: https://pyup.io/repos/github/audreyr/binaryornot/ + :alt: Updates + +Ultra-lightweight pure Python package to guess whether a file is binary or text, +using a heuristic similar to Perl's `pp_fttext` and its analysis by @eliben. 
+ +* Free software: BSD license +* Documentation: https://binaryornot.readthedocs.io + +Status +------ + +It works, and people are using this package in various places. But it doesn't cover all edge cases yet. + +The code could be improved. Pull requests welcome! As of now, it is based on these snippets, but that may change: + +* http://stackoverflow.com/questions/898669/how-can-i-detect-if-a-file-is-binary-non-text-in-python +* http://stackoverflow.com/questions/1446549/how-to-identify-binary-and-text-files-using-python +* http://code.activestate.com/recipes/173220/ +* http://eli.thegreenplace.net/2011/10/19/perls-guess-if-file-is-text-or-binary-implemented-in-python/ + +Features +-------- + +Has tests for these file types: + +* Text: .txt, .css, .json, .svg, .js, .lua, .pl, .rst +* Binary: .png, .gif, .jpg, .tiff, .bmp, .DS_Store, .eot, .otf, .ttf, .woff, .rgb + +Has tests for numerous encodings. + +Why? +---- + +You may be thinking, "I can write this in 2 lines of code?!" + +It's actually not that easy. Here's a great article about how Perl's +heuristic to guess file types works: http://eli.thegreenplace.net/2011/10/19/perls-guess-if-file-is-text-or-binary-implemented-in-python/ + +And that's just where we started. Over time, we've found more edge cases and +our heuristic has gotten more complex. + +Also, this package saves you from having to write and thoroughly test +your code with all sorts of weird file types and encodings, cross-platform. + +Builds +------ + +Linux (Ubuntu 12.04 LTS Server Edition 64 bit): + +.. image:: https://img.shields.io/travis/audreyr/binaryornot/master.svg + :target: https://travis-ci.org/audreyr/binaryornot + +Windows (Windows Server 2012 R2 (x64)): + +.. 
image:: https://img.shields.io/appveyor/ci/audreyr/binaryornot/master.svg + :target: https://ci.appveyor.com/project/audreyr/binaryornot + +Credits +------- + +* Special thanks to Eli Bendersky (@eliben) for his writeup explaining the heuristic and his implementation, which this is largely based on. +* Source code from the portion of Perl's `pp_fttext` that checks for textiness: https://github.com/Perl/perl5/blob/v5.23.1/pp_sys.c#L3527-L3587 + + + + +History +------- + +0.4.4 (2017-04-13) +~~~~~~~~~~~~~~~~~~ + +* Notify users for file i/o issues. Thanks @lukehinds! + + +0.4.3 (2017-04-13) +~~~~~~~~~~~~~~~~~~ + +* Restricted chardet to anything 3.0.2 or higher due to https://github.com/chardet/chardet/issues/113. Thanks @dan-blanchard for the quick fix! + +0.4.2 (2017-04-12) +~~~~~~~~~~~~~~~~~~ + +* Restricted chardet to anything under 3.0 due to https://github.com/chardet/chardet/issues/113 +* Added pyup badge +* Added utilities for pushing new versions up + +0.4.0 (2015-08-21) +~~~~~~~~~~~~~~~~~~ + +* Enhanced detection for some binary streams and UTF texts. (#10, 11) Thanks `@pombredanne`_. +* Set up Appveyor for continuous testing on Windows. Thanks `@pydanny`_. +* Update link to Perl source implementation. (#9) Thanks `@asmeurer`_ `@pombredanne`_ `@audreyr`_. +* Handle UnicodeDecodeError in check. (#12) Thanks `@DRMacIver`_. +* Add very simple Hypothesis based tests. (#13) Thanks `@DRMacIver`_. +* Use setup to determine requirements and remove redundant requirements.txt. (#14) Thanks `@hackebrot`_. +* Add documentation status badge to README.rst. (#15) Thanks `@hackebrot`_. +* Run tox in travis.yml. Add pypy and Python 3.4 to tox environments. (#16) Thanks `@hackebrot`_ `@pydanny`_. +* Handle LookupError when detecting encoding. (#17) Thanks `@DRMacIver`_. + + +.. _`@pombredanne`: https://github.com/pombredanne +.. _`@pydanny`: https://github.com/pydanny +.. _`@asmeurer`: https://github.com/asmeurer +.. _`@audreyr`: https://github.com/audreyr +.. 
_`@DRMacIver`: https://github.com/DRMacIver +.. _`@hackebrot`: https://github.com/hackebrot + +0.3.0 (2014-05-05) +~~~~~~~~~~~~~~~~~~ + +* Include tests, docs in source package. (#6) Thanks `@vincentbernat`_. +* Drop unnecessary shebangs and executable bits. (#8) Thanks `@scop`_. +* Generate string of printable extended ASCII bytes only once. (#7) Thanks `@scop`_. +* Make number of bytes to read parametrizable. (#7) Thanks `@scop`_. + +.. _`@vincentbernat`: https://github.com/vincentbernat +.. _`@scop`: https://github.com/scop + +0.2.0 (2013-09-22) +~~~~~~~~~~~~~~~~~~ + +* Complete rewrite of everything. Thanks `@ncoghlan`_. + +.. _`@ncoghlan`: https://github.com/ncoghlan + +0.1.1 (2013-08-17) +~~~~~~~~~~~~~~~~~~ + +* Tests pass under Python 2.6, 2.7, 3.3, PyPy. + + +0.1.0 (2013-08-17) +~~~~~~~~~~~~~~~~~~ + +* First release on PyPI. + + diff --git a/third_party/python/binaryornot/binaryornot-0.4.4.dist-info/RECORD b/third_party/python/binaryornot/binaryornot-0.4.4.dist-info/RECORD new file mode 100644 index 0000000000..ce88709fdf --- /dev/null +++ b/third_party/python/binaryornot/binaryornot-0.4.4.dist-info/RECORD @@ -0,0 +1,9 @@ +binaryornot/__init__.py,sha256=XfSXEYNIhIlBmsiUBzn8nDNSUA-2P5iseHo6sD1ZlyE,80
+binaryornot/check.py,sha256=Doh9gd_DUYWdGpJ8CCMmu4A-bul3_kV3qnl4OsgDp10,756
+binaryornot/helpers.py,sha256=p6aDimVStPQKZeQNHKkQNneeD8jfT4qpm9hCiV_8jYU,4737
+binaryornot-0.4.4.dist-info/DESCRIPTION.rst,sha256=QGOp8ciWl3QJXmtUSCWTYNLFy4exWZ4IIieSuJ1YWmQ,5120
+binaryornot-0.4.4.dist-info/METADATA,sha256=ZbKKAHfl5XQchYqQmoQ6sE7ya3RfAyNBECqYmmtany8,5995
+binaryornot-0.4.4.dist-info/RECORD,,
+binaryornot-0.4.4.dist-info/WHEEL,sha256=o2k-Qa-RMNIJmUdIc7KU6VWR_ErNRbWNlxDIpl7lm34,110
+binaryornot-0.4.4.dist-info/metadata.json,sha256=MDiTh918QrO8YioOU2Rjh-6NO2xehibwl5nSc0NJWWg,1022
+binaryornot-0.4.4.dist-info/top_level.txt,sha256=xSk7ScGP__GIh_D2caJfJk0oRzgpyyqjWiozi5_nvms,12
diff --git a/third_party/python/binaryornot/binaryornot-0.4.4.dist-info/WHEEL b/third_party/python/binaryornot/binaryornot-0.4.4.dist-info/WHEEL new file mode 100644 index 0000000000..8b6dd1b5a8 --- /dev/null +++ b/third_party/python/binaryornot/binaryornot-0.4.4.dist-info/WHEEL @@ -0,0 +1,6 @@ +Wheel-Version: 1.0 +Generator: bdist_wheel (0.29.0) +Root-Is-Purelib: true +Tag: py2-none-any +Tag: py3-none-any + diff --git a/third_party/python/binaryornot/binaryornot-0.4.4.dist-info/metadata.json b/third_party/python/binaryornot/binaryornot-0.4.4.dist-info/metadata.json new file mode 100644 index 0000000000..ac09fa3963 --- /dev/null +++ b/third_party/python/binaryornot/binaryornot-0.4.4.dist-info/metadata.json @@ -0,0 +1 @@ +{"classifiers": ["Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "License :: OSI Approved :: BSD License", "Natural Language :: English", "Programming Language :: Python :: 2", "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.3", "Programming Language :: Python :: 3.4", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6"], "extensions": {"python.details": {"contacts": [{"email": "aroy@alum.mit.edu", "name": "Audrey Roy Greenfeld", "role": "author"}], "document_names": {"description": "DESCRIPTION.rst"}, "project_urls": {"Home": "https://github.com/audreyr/binaryornot"}}}, "extras": [], "generator": "bdist_wheel (0.29.0)", "keywords": ["binaryornot"], "license": "BSD", "metadata_version": "2.0", "name": "binaryornot", "run_requires": [{"requires": ["chardet (>=3.0.2)"]}], "summary": "Ultra-lightweight pure Python package to check if a file is binary or text.", "version": "0.4.4"}
\ No newline at end of file
diff --git a/third_party/python/binaryornot/binaryornot-0.4.4.dist-info/top_level.txt b/third_party/python/binaryornot/binaryornot-0.4.4.dist-info/top_level.txt
new file mode 100644
index 0000000000..62c9ba1d6a
--- /dev/null
+++ b/third_party/python/binaryornot/binaryornot-0.4.4.dist-info/top_level.txt
@@ -0,0 +1 @@
+binaryornot
diff --git a/third_party/python/binaryornot/binaryornot/__init__.py b/third_party/python/binaryornot/binaryornot/__init__.py
new file mode 100644
index 0000000000..518255b16b
--- /dev/null
+++ b/third_party/python/binaryornot/binaryornot/__init__.py
@@ -0,0 +1,3 @@
+__author__ = 'Audrey Roy'
+__email__ = 'audreyr@gmail.com'
+__version__ = '0.4.4'
diff --git a/third_party/python/binaryornot/binaryornot/check.py b/third_party/python/binaryornot/binaryornot/check.py
new file mode 100644
index 0000000000..a784e3a77f
--- /dev/null
+++ b/third_party/python/binaryornot/binaryornot/check.py
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+
+"""
+binaryornot.check
+-----------------
+
+Main code for checking if a file is binary or text.
+"""
+
+import logging
+
+from .helpers import get_starting_chunk, is_binary_string
+
+
+logger = logging.getLogger(__name__)
+
+
+def is_binary(filename):
+    """
+    Guess whether a file is binary or text.
+
+    Files whose name ends in a known binary extension are reported as binary
+    without being opened. Otherwise the starting chunk of the file is read and
+    classified by :func:`is_binary_string`. If the file cannot be read,
+    :func:`get_starting_chunk` returns None, which is classified as text.
+
+    :param filename: File to check.
+    :returns: True if it's a binary file, otherwise False.
+    """
+    logger.debug('is_binary: %r', filename)
+
+    # Check if the file extension is in a list of known binary types.
+    # str.endswith accepts a tuple, so one call covers every extension.
+    binary_extensions = ('.pyc',)
+    if filename.endswith(binary_extensions):
+        return True
+
+    # Check if the starting chunk is a binary string
+    chunk = get_starting_chunk(filename)
+    return is_binary_string(chunk)
diff --git a/third_party/python/binaryornot/binaryornot/helpers.py b/third_party/python/binaryornot/binaryornot/helpers.py
new file mode 100644
index 0000000000..3f034a695a
--- /dev/null
+++ b/third_party/python/binaryornot/binaryornot/helpers.py
@@ -0,0 +1,155 @@
+# -*- coding: utf-8 -*-
+
+
+"""
+binaryornot.helpers
+-------------------
+
+Helper utilities used by BinaryOrNot.
+"""
+
+import chardet
+import logging
+
+
+logger = logging.getLogger(__name__)
+
+
+def print_as_hex(s):
+    """
+    Print a string as hex bytes.
+
+    Accepts text strings as well as byte strings. Iterating over ``bytes`` on
+    Python 3 yields integers rather than one-character strings, so ``ord()``
+    is only applied to items that are not already integers.
+
+    :param s: String or bytes to print.
+    """
+    print(":".join(
+        "{0:x}".format(c if isinstance(c, int) else ord(c)) for c in s))
+
+
+def get_starting_chunk(filename, length=1024):
+    """
+    Read the first ``length`` bytes of a file.
+
+    :param filename: File to open and get the first little chunk of.
+    :param length: Number of bytes to read, default 1024.
+    :returns: Starting chunk of bytes, or None if the file could not be
+        opened or read (the error is printed instead of being raised).
+    """
+    # Ensure we open the file in binary mode
+    try:
+        with open(filename, 'rb') as f:
+            return f.read(length)
+    except IOError as e:
+        # Best effort: report the problem on stdout and hand back None, which
+        # is_binary_string() treats like an empty chunk (i.e. text).
+        print(e)
+        return None
+
+
+_control_chars = b'\n\r\t\f\b'
+if bytes is str:
+    # Python 2 means we need to invoke chr() explicitly
+    _printable_ascii = _control_chars + b''.join(map(chr, range(32, 127)))
+    _printable_high_ascii = b''.join(map(chr, range(127, 256)))
+else:
+    # Python 3 means bytes accepts integer input directly
+    _printable_ascii = _control_chars + bytes(range(32, 127))
+    _printable_high_ascii = bytes(range(127, 256))
+
+
+def is_binary_string(bytes_to_check):
+    """
+    Uses a simplified version of the Perl detection algorithm,
+    based roughly on Eli Bendersky's translation to Python:
+    http://eli.thegreenplace.net/2011/10/19/perls-guess-if-file-is-text-or-binary-implemented-in-python/
+
+    This is biased slightly more in favour of deeming files as text
+    files than the Perl algorithm, since all ASCII compatible character
+    sets are accepted as text, not just utf-8.
+
+    The verdict is reached in this order of precedence:
+
+    1. If chardet reports a non-ASCII encoding with confidence above 0.9 and
+       the bytes decode cleanly with it, the chunk is text.
+    2. Otherwise, if the byte-ratio heuristic flags the chunk, it is binary.
+    3. Otherwise, the chunk is binary only if it contains a 0x00 or 0xff
+       byte.
+
+    :param bytes_to_check: A chunk of bytes to check.
+    :returns: True if appears to be a binary, otherwise False.
+    """
+
+    # Empty files are considered text files
+    if not bytes_to_check:
+        return False
+
+    # Share of bytes left after deleting printable ASCII and the characters
+    # in _control_chars, i.e. bytes that are NOT plain text: the remaining
+    # control characters plus every byte in the 127-255 range.
+    low_chars = bytes_to_check.translate(None, _printable_ascii)
+    nontext_ratio1 = float(len(low_chars)) / float(len(bytes_to_check))
+    logger.debug('nontext_ratio1: %r', nontext_ratio1)
+
+    # Share of bytes left after deleting the 127-255 range. Despite its name,
+    # high_chars holds what remains once the high bytes are removed, so
+    # nontext_ratio2 is the share of bytes BELOW 127.
+    # From: https://en.wikipedia.org/wiki/UTF-8
+    # If the bytes are random, the chances of a byte with the high bit set
+    # starting a valid UTF-8 character is only 6.64%. The chances of finding 7
+    # of these without finding an invalid sequence is actually lower than the
+    # chance of the first three bytes randomly being the UTF-8 BOM.
+
+    high_chars = bytes_to_check.translate(None, _printable_high_ascii)
+    nontext_ratio2 = float(len(high_chars)) / float(len(bytes_to_check))
+    logger.debug('nontext_ratio2: %r', nontext_ratio2)
+
+    # Likely binary when the chunk is either almost entirely high bytes
+    # (first clause) or mostly ASCII control characters (second clause).
+    is_likely_binary = (
+        (nontext_ratio1 > 0.3 and nontext_ratio2 < 0.05) or
+        (nontext_ratio1 > 0.8 and nontext_ratio2 > 0.8)
+    )
+    logger.debug('is_likely_binary: %r', is_likely_binary)
+
+    # then ask chardet which encoding, if any, the chunk looks like
+    detected_encoding = chardet.detect(bytes_to_check)
+    logger.debug('detected_encoding: %r', detected_encoding)
+
+    # finally use all the checks to decide binary or text
+    decodable_as_unicode = False
+    if (detected_encoding['confidence'] > 0.9 and
+            detected_encoding['encoding'] != 'ascii'):
+        try:
+            try:
+                bytes_to_check.decode(encoding=detected_encoding['encoding'])
+            except TypeError:
+                # happens only on Python 2.6
+                unicode(bytes_to_check, encoding=detected_encoding['encoding'])  # noqa
+            decodable_as_unicode = True
+            logger.debug('success: decodable_as_unicode: %r',
+                         decodable_as_unicode)
+        except LookupError:
+            logger.debug('failure: could not look up encoding %(encoding)s',
+                         detected_encoding)
+        except UnicodeDecodeError:
+            logger.debug('failure: decodable_as_unicode: %r',
+                         decodable_as_unicode)
+
+    # Logged on every path, so the message must not claim success or failure.
+    logger.debug('decodable_as_unicode: %r', decodable_as_unicode)
+
+    # A clean decode with a confidently detected encoding always means text.
+    if decodable_as_unicode:
+        return False
+    if is_likely_binary:
+        return True
+
+    # Check for NULL bytes last
+    has_nulls = b'\x00' in bytes_to_check
+    if has_nulls or b'\xff' in bytes_to_check:
+        logger.debug('has nulls:%r', has_nulls)
+        return True
+    return False