diff options
Diffstat (limited to 'pydyf/__init__.py')
-rwxr-xr-x | pydyf/__init__.py | 298 |
1 files changed, 223 insertions, 75 deletions
diff --git a/pydyf/__init__.py b/pydyf/__init__.py index 05dccf6..86d321d 100755 --- a/pydyf/__init__.py +++ b/pydyf/__init__.py @@ -3,26 +3,28 @@ A low-level PDF generator. """ +import base64 import re import zlib from codecs import BOM_UTF16_BE +from hashlib import md5 +from math import ceil, log +from warnings import warn -VERSION = __version__ = '0.1.2' +VERSION = __version__ = '0.10.0' def _to_bytes(item): """Convert item to bytes.""" if isinstance(item, bytes): return item - elif isinstance(item, Object): - return item.data elif isinstance(item, float): if item.is_integer(): - return f'{int(item):d}'.encode('ascii') + return str(int(item)).encode('ascii') else: - return f'{item:f}'.encode('ascii') - elif isinstance(item, int): - return f'{item:d}'.encode('ascii') + return f'{item:f}'.rstrip('0').encode('ascii') + elif isinstance(item, Object): + return item.data return str(item).encode('ascii') @@ -42,51 +44,41 @@ class Object: @property def indirect(self): """Indirect representation of an object.""" - return b'\n'.join(( - str(self.number).encode() + b' ' + - str(self.generation).encode() + b' obj', - self.data, - b'endobj', - )) + header = f'{self.number} {self.generation} obj\n'.encode() + return header + self.data + b'\nendobj' @property def reference(self): """Object identifier.""" - return ( - str(self.number).encode() + b' ' + - str(self.generation).encode() + b' R') + return f'{self.number} {self.generation} R'.encode() @property def data(self): """Data contained in the object. Shall be defined in each subclass.""" raise NotImplementedError() + @property + def compressible(self): + """Whether the object can be included in an object stream.""" + return not self.generation and not isinstance(self, Stream) -class Dictionary(Object, dict): - """PDF Dictionary object. - - Inherits from :class:`Object` and Python :obj:`dict`. - """ +class Dictionary(Object, dict): + """PDF Dictionary object.""" def __init__(self, values=None): Object.__init__(self) dict.__init__(self, values or {}) @property def data(self): - result = [b'<<'] - for key, value in self.items(): - result.append(b'/' + _to_bytes(key) + b' ' + _to_bytes(value)) - result.append(b'>>') - return b'\n'.join(result) + result = [ + b'/' + _to_bytes(key) + b' ' + _to_bytes(value) + for key, value in self.items()] + return b'<<' + b''.join(result) + b'>>' class Stream(Object): - """PDF Stream object. - - Inherits from :class:`Object`. - - """ + """PDF Stream object.""" def __init__(self, stream=None, extra=None, compress=False): super().__init__() #: Python array of data composing stream. @@ -96,6 +88,15 @@ class Stream(Object): #: Compress the stream data if set to ``True``. Default is ``False``. self.compress = compress + def begin_marked_content(self, tag, property_list=None): + """Begin marked-content sequence.""" + self.stream.append(f'/{tag}') + if property_list is None: + self.stream.append(b'BMC') + else: + self.stream.append(property_list) + self.stream.append(b'BDC') + def begin_text(self): """Begin a text object.""" self.stream.append(b'BT') @@ -171,6 +172,10 @@ class Stream(Object): """End path without filling or stroking.""" self.stream.append(b'n') + def end_marked_content(self): + """End marked-content sequence.""" + self.stream.append(b'EMC') + def end_text(self): """End text object.""" self.stream.append(b'ET') @@ -199,6 +204,37 @@ class Stream(Object): """ self.stream.append(b'b*' if even_odd else b'b') + def inline_image(self, width, height, color_space, bpc, raw_data): + """Add an inline image. + + :param width: The width of the image. + :type width: :obj:`int` + :param height: The height of the image. + :type height: :obj:`int` + :param colorspace: The color space of the image, f.e. RGB, Gray. + :type colorspace: :obj:`str` + :param bpc: The bits per component. 1 for BW, 8 for grayscale. + :type bpc: :obj:`int` + :param raw_data: The raw pixel data. + + """ + data = zlib.compress(raw_data) if self.compress else raw_data + a85_data = base64.a85encode(data) + b'~>' + self.stream.append(b' '.join(( + b'BI', + b'/W', _to_bytes(width), + b'/H', _to_bytes(height), + b'/BPC', _to_bytes(bpc), + b'/CS', + b'/Device' + _to_bytes(color_space), + b'/F', + b'[/A85 /Fl]' if self.compress else b'/A85', + b'/L', _to_bytes(len(a85_data)), + b'ID', + a85_data, + b'EI', + ))) + def line_to(self, x, y): """Add line from current point to point ``(x, y)``.""" self.stream.append(b' '.join((_to_bytes(x), _to_bytes(y), b'l'))) @@ -207,6 +243,10 @@ class Stream(Object): """Begin new subpath by moving current point to ``(x, y)``.""" self.stream.append(b' '.join((_to_bytes(x), _to_bytes(y), b'm'))) + def move_text_to(self, x, y): + """Move text to next line at ``(x, y)`` distance from previous line.""" + self.stream.append(b' '.join((_to_bytes(x), _to_bytes(y), b'Td'))) + def shading(self, name): """Paint shape and color shading using shading dictionary ``name``.""" self.stream.append(b'/' + _to_bytes(name) + b' sh') @@ -271,6 +311,10 @@ class Stream(Object): """Set text rendering mode.""" self.stream.append(_to_bytes(mode) + b' Tr') + def set_text_rise(self, height): + """Set text rise.""" + self.stream.append(_to_bytes(height) + b' Ts') + def set_line_cap(self, line_cap): """Set line cap style.""" self.stream.append(_to_bytes(line_cap) + b' J') @@ -296,9 +340,13 @@ class Stream(Object): self.stream.append(b'/' + _to_bytes(state_name) + b' gs') def show_text(self, text): - """Show text.""" + """Show text strings with individual glyph positioning.""" self.stream.append(b'[' + _to_bytes(text) + b'] TJ') + def show_text_string(self, text): + """Show single text string.""" + self.stream.append(String(text).data + b' Tj') + def stroke(self): """Stroke path.""" self.stream.append(b'S') @@ -355,7 +403,7 @@ class Stream(Object): extra = Dictionary(self.extra.copy()) if self.compress: extra['Filter'] = '/FlateDecode' - compressobj = zlib.compressobj() + compressobj = zlib.compressobj(level=9) stream = compressobj.compress(stream) stream += compressobj.flush() extra['Length'] = len(stream) @@ -363,11 +411,7 @@ class Stream(Object): class String(Object): - """PDF String object. - - Inherits from :class:`Object`. - - """ + """PDF String object.""" def __init__(self, string=''): super().__init__() #: Unicode string. @@ -388,27 +432,29 @@ class String(Object): class Array(Object, list): - """PDF Array object. - - Inherits from :class:`Object` and Python :obj:`list`. - - """ + """PDF Array object.""" def __init__(self, array=None): Object.__init__(self) list.__init__(self, array or []) @property def data(self): - result = [b'['] - for child in self: - result.append(_to_bytes(child)) - result.append(b']') - return b' '.join(result) + return b'[' + b' '.join(_to_bytes(child) for child in self) + b']' class PDF: """PDF document.""" - def __init__(self): + def __init__(self, version=None, identifier=None): + """Create a PDF document.""" + if version or identifier: # to be removed in next version + warn( + "PDF objects don’t take version or identifier during initialization " + "anymore. These properties are now stored but ignored, and will be " + "removed and rejected in next version of pydyf. Please pass these " + "properties to the PDF.write() method instead.", DeprecationWarning) + self.version = _to_bytes(version) if version else b'1.7' # to be removed + self.identifier = identifier # to be removed + #: Python :obj:`list` containing the PDF’s objects. self.objects = [] @@ -425,7 +471,7 @@ class PDF: }) self.add_object(self.pages) - #: PDF :class:`Dictionary` containing the PDF’s metadata. + #: PDF :class:`Dictionary` containing the PDF’s metadata. self.info = Dictionary({}) self.add_object(self.info) @@ -457,6 +503,12 @@ class PDF: object_.number = len(self.objects) self.objects.append(object_) + @property + def page_references(self): + return tuple( + f'{object_number} 0 R'.encode('ascii') + for object_number in self.pages['Kids'][::3]) + def write_line(self, content, output): """Write line to output. @@ -469,40 +521,136 @@ class PDF: self.current_position += len(content) + 1 output.write(content + b'\n') - def write(self, output): + def write(self, output, version=b'1.7', identifier=False, compress=False): """Write PDF to output. :param output: Output stream. :type output: binary :term:`file object` + :param bytes version: PDF version. + :param identifier: PDF file identifier. Default is :obj:`False` + to include no identifier, can be set to :obj:`True` to generate an + automatic identifier. + :type identifier: :obj:`bytes` or :obj:`bool` + :param bool compress: whether the PDF uses a compressed object stream. """ + # Convert version and identifier to bytes + version = _to_bytes(version or b'1.7') # Force 1.7 when None + if identifier not in (False, True, None): + identifier = _to_bytes(identifier) + # Write header - self.write_line(b'%PDF-1.7', output) + self.write_line(b'%PDF-' + version, output) self.write_line(b'%\xf0\x9f\x96\xa4', output) - # Write all non-free PDF objects - for object_ in self.objects: - if object_.free == 'f': - continue - object_.offset = self.current_position - self.write_line(object_.indirect, output) - - # Write cross reference table - self.xref_position = self.current_position - self.write_line(b'xref', output) - self.write_line(f'0 {len(self.objects)}'.encode(), output) - for object_ in self.objects: - self.write_line( - (f'{object_.offset:010} {object_.generation:05} ' - f'{object_.free} ').encode(), output) - - # Write trailer - self.write_line(b'trailer', output) - self.write_line(b'<<', output) - self.write_line(f'/Size {len(self.objects)}'.encode(), output) - self.write_line(b'/Root ' + self.catalog.reference, output) - self.write_line(b'/Info ' + self.info.reference, output) - self.write_line(b'>>', output) + if version >= b'1.5' and compress: + # Store compressed objects for later and write other ones in PDF + compressed_objects = [] + for object_ in self.objects: + if object_.free == 'f': + continue + if object_.compressible: + compressed_objects.append(object_) + else: + object_.offset = self.current_position + self.write_line(object_.indirect, output) + + # Write compressed objects in object stream + stream = [[]] + position = 0 + for i, object_ in enumerate(compressed_objects): + data = object_.data + stream.append(data) + stream[0].append(object_.number) + stream[0].append(position) + position += len(data) + 1 + stream[0] = ' '.join(str(i) for i in stream[0]) + extra = { + 'Type': '/ObjStm', + 'N': len(compressed_objects), + 'First': len(stream[0]) + 1, + } + object_stream = Stream(stream, extra, compress) + object_stream.offset = self.current_position + self.add_object(object_stream) + self.write_line(object_stream.indirect, output) + + # Write cross-reference stream + xref = [] + dict_index = 0 + for object_ in self.objects: + if object_.compressible: + xref.append((2, object_stream.number, dict_index)) + dict_index += 1 + else: + xref.append(( + bool(object_.number), object_.offset, + object_.generation)) + xref.append((1, self.current_position, 0)) + + field2_size = ceil(log(self.current_position + 1, 256)) + max_generation = max( + object_.generation for object_ in self.objects) + field3_size = ceil(log( + max(max_generation, len(compressed_objects)) + 1, 256)) + xref_lengths = (1, field2_size, field3_size) + xref_stream = b''.join( + value.to_bytes(length, 'big') + for line in xref for length, value in zip(xref_lengths, line)) + extra = { + 'Type': '/XRef', + 'Index': Array((0, len(self.objects) + 1)), + 'W': Array(xref_lengths), + 'Size': len(self.objects) + 1, + 'Root': self.catalog.reference, + 'Info': self.info.reference, + } + if identifier: + data = b''.join( + obj.data for obj in self.objects if obj.free != 'f') + data_hash = md5(data).hexdigest().encode() + if identifier is True: + identifier = data_hash + extra['ID'] = Array(( + String(identifier).data, String(data_hash).data)) + dict_stream = Stream([xref_stream], extra, compress) + self.xref_position = dict_stream.offset = self.current_position + self.add_object(dict_stream) + self.write_line(dict_stream.indirect, output) + else: + # Write all non-free PDF objects + for object_ in self.objects: + if object_.free == 'f': + continue + object_.offset = self.current_position + self.write_line(object_.indirect, output) + + # Write cross-reference table + self.xref_position = self.current_position + self.write_line(b'xref', output) + self.write_line(f'0 {len(self.objects)}'.encode(), output) + for object_ in self.objects: + self.write_line( + (f'{object_.offset:010} {object_.generation:05} ' + f'{object_.free} ').encode(), output) + + # Write trailer + self.write_line(b'trailer', output) + self.write_line(b'<<', output) + self.write_line(f'/Size {len(self.objects)}'.encode(), output) + self.write_line(b'/Root ' + self.catalog.reference, output) + self.write_line(b'/Info ' + self.info.reference, output) + if identifier: + data = b''.join( + obj.data for obj in self.objects if obj.free != 'f') + data_hash = md5(data).hexdigest().encode() + if identifier is True: + identifier = data_hash + self.write_line( + b'/ID [' + String(identifier).data + b' ' + + String(data_hash).data + b']', output) + self.write_line(b'>>', output) + self.write_line(b'startxref', output) self.write_line(f'{self.xref_position}'.encode(), output) self.write_line(b'%%EOF', output) |