1 files changed, 223 insertions, 75 deletions
diff --git a/pydyf/__init__.py b/pydyf/__init__.py
index 05dccf6..86d321d 100755
--- a/pydyf/__init__.py
+++ b/pydyf/__init__.py
@@ -3,26 +3,28 @@ A low-level PDF generator.
 
 """
 
+import base64
 import re
 import zlib
 from codecs import BOM_UTF16_BE
+from hashlib import md5
+from math import ceil, log
+from warnings import warn
 
-VERSION = __version__ = '0.1.2'
+VERSION = __version__ = '0.10.0'
 
 
 def _to_bytes(item):
     """Convert item to bytes."""
     if isinstance(item, bytes):
         return item
-    elif isinstance(item, Object):
-        return item.data
     elif isinstance(item, float):
         if item.is_integer():
-            return f'{int(item):d}'.encode('ascii')
+            return str(int(item)).encode('ascii')
         else:
-            return f'{item:f}'.encode('ascii')
-    elif isinstance(item, int):
-        return f'{item:d}'.encode('ascii')
+            return f'{item:f}'.rstrip('0').encode('ascii')
+    elif isinstance(item, Object):
+        return item.data
     return str(item).encode('ascii')
 
 
@@ -42,51 +44,41 @@ class Object:
     @property
     def indirect(self):
         """Indirect representation of an object."""
-        return b'\n'.join((
-            str(self.number).encode() + b' ' +
-            str(self.generation).encode() + b' obj',
-            self.data,
-            b'endobj',
-        ))
+        header = f'{self.number} {self.generation} obj\n'.encode()
+        return header + self.data + b'\nendobj'
 
     @property
     def reference(self):
         """Object identifier."""
-        return (
-            str(self.number).encode() + b' ' +
-            str(self.generation).encode() + b' R')
+        return f'{self.number} {self.generation} R'.encode()
 
     @property
     def data(self):
         """Data contained in the object. Shall be defined in each subclass."""
         raise NotImplementedError()
 
+    @property
+    def compressible(self):
+        """Whether the object can be included in an object stream."""
+        return not self.generation and not isinstance(self, Stream)
 
-class Dictionary(Object, dict):
-    """PDF Dictionary object.
-
-    Inherits from :class:`Object` and Python :obj:`dict`.
 
-    """
+class Dictionary(Object, dict):
+    """PDF Dictionary object."""
     def __init__(self, values=None):
         Object.__init__(self)
         dict.__init__(self, values or {})
 
     @property
     def data(self):
-        result = [b'<<']
-        for key, value in self.items():
-            result.append(b'/' + _to_bytes(key) + b' ' + _to_bytes(value))
-        result.append(b'>>')
-        return b'\n'.join(result)
+        result = [
+            b'/' + _to_bytes(key) + b' ' + _to_bytes(value)
+            for key, value in self.items()]
+        return b'<<' + b''.join(result) + b'>>'
 
 
 class Stream(Object):
-    """PDF Stream object.
-
-    Inherits from :class:`Object`.
-
-    """
+    """PDF Stream object."""
     def __init__(self, stream=None, extra=None, compress=False):
         super().__init__()
         #: Python array of data composing stream.
@@ -96,6 +88,15 @@ class Stream(Object):
         #: Compress the stream data if set to ``True``. Default is ``False``.
         self.compress = compress
 
+    def begin_marked_content(self, tag, property_list=None):
+        """Begin marked-content sequence."""
+        self.stream.append(f'/{tag}')
+        if property_list is None:
+            self.stream.append(b'BMC')
+        else:
+            self.stream.append(property_list)
+            self.stream.append(b'BDC')
+
     def begin_text(self):
         """Begin a text object."""
         self.stream.append(b'BT')
@@ -171,6 +172,10 @@ class Stream(Object):
         """End path without filling or stroking."""
         self.stream.append(b'n')
 
+    def end_marked_content(self):
+        """End marked-content sequence."""
+        self.stream.append(b'EMC')
+
     def end_text(self):
         """End text object."""
         self.stream.append(b'ET')
@@ -199,6 +204,37 @@ class Stream(Object):
         """
         self.stream.append(b'b*' if even_odd else b'b')
 
+    def inline_image(self, width, height, color_space, bpc, raw_data):
+        """Add an inline image.
+
+        :param width: The width of the image.
+        :type width: :obj:`int`
+        :param height: The height of the image.
+        :type height: :obj:`int`
+        :param color_space: The color space of the image, e.g. RGB, Gray.
+        :type color_space: :obj:`str`
+        :param bpc: The bits per component. 1 for BW, 8 for grayscale.
+        :type bpc: :obj:`int`
+        :param raw_data: The raw pixel data.
+
+        """
+        data = zlib.compress(raw_data) if self.compress else raw_data
+        a85_data = base64.a85encode(data) + b'~>'
+        self.stream.append(b' '.join((
+            b'BI',
+            b'/W', _to_bytes(width),
+            b'/H', _to_bytes(height),
+            b'/BPC', _to_bytes(bpc),
+            b'/CS',
+            b'/Device' + _to_bytes(color_space),
+            b'/F',
+            b'[/A85 /Fl]' if self.compress else b'/A85',
+            b'/L', _to_bytes(len(a85_data)),
+            b'ID',
+            a85_data,
+            b'EI',
+        )))
+
     def line_to(self, x, y):
         """Add line from current point to point ``(x, y)``."""
         self.stream.append(b' '.join((_to_bytes(x), _to_bytes(y), b'l')))
@@ -207,6 +243,10 @@ class Stream(Object):
         """Begin new subpath by moving current point to ``(x, y)``."""
         self.stream.append(b' '.join((_to_bytes(x), _to_bytes(y), b'm')))
 
+    def move_text_to(self, x, y):
+        """Move text to next line at ``(x, y)`` distance from previous line."""
+        self.stream.append(b' '.join((_to_bytes(x), _to_bytes(y), b'Td')))
+
     def shading(self, name):
         """Paint shape and color shading using shading dictionary ``name``."""
         self.stream.append(b'/' + _to_bytes(name) + b' sh')
@@ -271,6 +311,10 @@ class Stream(Object):
         """Set text rendering mode."""
         self.stream.append(_to_bytes(mode) + b' Tr')
 
+    def set_text_rise(self, height):
+        """Set text rise."""
+        self.stream.append(_to_bytes(height) + b' Ts')
+
     def set_line_cap(self, line_cap):
         """Set line cap style."""
         self.stream.append(_to_bytes(line_cap) + b' J')
@@ -296,9 +340,13 @@ class Stream(Object):
         self.stream.append(b'/' + _to_bytes(state_name) + b' gs')
 
     def show_text(self, text):
-        """Show text."""
+        """Show text strings with individual glyph positioning."""
         self.stream.append(b'[' + _to_bytes(text) + b'] TJ')
 
+    def show_text_string(self, text):
+        """Show single text string."""
+        self.stream.append(String(text).data + b' Tj')
+
     def stroke(self):
         """Stroke path."""
         self.stream.append(b'S')
@@ -355,7 +403,7 @@ class Stream(Object):
         extra = Dictionary(self.extra.copy())
         if self.compress:
             extra['Filter'] = '/FlateDecode'
-            compressobj = zlib.compressobj()
+            compressobj = zlib.compressobj(level=9)
             stream = compressobj.compress(stream)
             stream += compressobj.flush()
         extra['Length'] = len(stream)
@@ -363,11 +411,7 @@ class Stream(Object):
 
 
 class String(Object):
-    """PDF String object.
-
-    Inherits from :class:`Object`.
-
-    """
+    """PDF String object."""
     def __init__(self, string=''):
         super().__init__()
         #: Unicode string.
@@ -388,27 +432,29 @@ class String(Object):
 
 
 class Array(Object, list):
-    """PDF Array object.
-
-    Inherits from :class:`Object` and Python :obj:`list`.
-
-    """
+    """PDF Array object."""
     def __init__(self, array=None):
         Object.__init__(self)
         list.__init__(self, array or [])
 
     @property
     def data(self):
-        result = [b'[']
-        for child in self:
-            result.append(_to_bytes(child))
-        result.append(b']')
-        return b' '.join(result)
+        return b'[' + b' '.join(_to_bytes(child) for child in self) + b']'
 
 
 class PDF:
     """PDF document."""
-    def __init__(self):
+    def __init__(self, version=None, identifier=None):
+        """Create a PDF document."""
+        if version or identifier:  # to be removed in next version
+            warn(
+                "PDF objects don’t take version or identifier during initialization "
+                "anymore. These properties are now stored but ignored, and will be "
+                "removed and rejected in next version of pydyf. Please pass these "
+                "properties to the PDF.write() method instead.", DeprecationWarning)
+        self.version = _to_bytes(version) if version else b'1.7'  # to be removed
+        self.identifier = identifier  # to be removed
+
         #: Python :obj:`list` containing the PDF’s objects.
         self.objects = []
 
@@ -425,7 +471,7 @@ class PDF:
         })
         self.add_object(self.pages)
 
-        #: PDF :class:`Dictionary` containing the PDF’s metadata.
+        #: PDF :class:`Dictionary` containing the PDF’s metadata.
         self.info = Dictionary({})
         self.add_object(self.info)
 
@@ -457,6 +503,12 @@ class PDF:
         object_.number = len(self.objects)
         self.objects.append(object_)
 
+    @property
+    def page_references(self):
+        return tuple(
+            f'{object_number} 0 R'.encode('ascii')
+            for object_number in self.pages['Kids'][::3])
+
     def write_line(self, content, output):
         """Write line to output.
 
@@ -469,40 +521,136 @@ class PDF:
         self.current_position += len(content) + 1
         output.write(content + b'\n')
 
-    def write(self, output):
+    def write(self, output, version=b'1.7', identifier=False, compress=False):
         """Write PDF to output.
 
         :param output: Output stream.
         :type output: binary :term:`file object`
+        :param bytes version: PDF version.
+        :param identifier: PDF file identifier. Default is :obj:`False`
+          to include no identifier, can be set to :obj:`True` to generate an
+          automatic identifier.
+        :type identifier: :obj:`bytes` or :obj:`bool`
+        :param bool compress: whether the PDF uses a compressed object stream.
 
         """
+        # Convert version and identifier to bytes
+        version = _to_bytes(version or b'1.7')  # Force 1.7 when None
+        if identifier not in (False, True, None):
+            identifier = _to_bytes(identifier)
+
         # Write header
-        self.write_line(b'%PDF-1.7', output)
+        self.write_line(b'%PDF-' + version, output)
         self.write_line(b'%\xf0\x9f\x96\xa4', output)
 
-        # Write all non-free PDF objects
-        for object_ in self.objects:
-            if object_.free == 'f':
-                continue
-            object_.offset = self.current_position
-            self.write_line(object_.indirect, output)
-
-        # Write cross reference table
-        self.xref_position = self.current_position
-        self.write_line(b'xref', output)
-        self.write_line(f'0 {len(self.objects)}'.encode(), output)
-        for object_ in self.objects:
-            self.write_line(
-                (f'{object_.offset:010} {object_.generation:05} '
-                 f'{object_.free} ').encode(), output)
-
-        # Write trailer
-        self.write_line(b'trailer', output)
-        self.write_line(b'<<', output)
-        self.write_line(f'/Size {len(self.objects)}'.encode(), output)
-        self.write_line(b'/Root ' + self.catalog.reference, output)
-        self.write_line(b'/Info ' + self.info.reference, output)
-        self.write_line(b'>>', output)
+        if version >= b'1.5' and compress:
+            # Store compressed objects for later and write other ones in PDF
+            compressed_objects = []
+            for object_ in self.objects:
+                if object_.free == 'f':
+                    continue
+                if object_.compressible:
+                    compressed_objects.append(object_)
+                else:
+                    object_.offset = self.current_position
+                    self.write_line(object_.indirect, output)
+
+            # Write compressed objects in object stream
+            stream = [[]]
+            position = 0
+            for i, object_ in enumerate(compressed_objects):
+                data = object_.data
+                stream.append(data)
+                stream[0].append(object_.number)
+                stream[0].append(position)
+                position += len(data) + 1
+            stream[0] = ' '.join(str(i) for i in stream[0])
+            extra = {
+                'Type': '/ObjStm',
+                'N': len(compressed_objects),
+                'First': len(stream[0]) + 1,
+            }
+            object_stream = Stream(stream, extra, compress)
+            object_stream.offset = self.current_position
+            self.add_object(object_stream)
+            self.write_line(object_stream.indirect, output)
+
+            # Write cross-reference stream
+            xref = []
+            dict_index = 0
+            for object_ in self.objects:
+                if object_.compressible:
+                    xref.append((2, object_stream.number, dict_index))
+                    dict_index += 1
+                else:
+                    xref.append((
+                        bool(object_.number), object_.offset,
+                        object_.generation))
+            xref.append((1, self.current_position, 0))
+
+            field2_size = ceil(log(self.current_position + 1, 256))
+            max_generation = max(
+                object_.generation for object_ in self.objects)
+            field3_size = ceil(log(
+                max(max_generation, len(compressed_objects)) + 1, 256))
+            xref_lengths = (1, field2_size, field3_size)
+            xref_stream = b''.join(
+                value.to_bytes(length, 'big')
+                for line in xref for length, value in zip(xref_lengths, line))
+            extra = {
+                'Type': '/XRef',
+                'Index': Array((0, len(self.objects) + 1)),
+                'W': Array(xref_lengths),
+                'Size': len(self.objects) + 1,
+                'Root': self.catalog.reference,
+                'Info': self.info.reference,
+            }
+            if identifier:
+                data = b''.join(
+                    obj.data for obj in self.objects if obj.free != 'f')
+                data_hash = md5(data).hexdigest().encode()
+                if identifier is True:
+                    identifier = data_hash
+                extra['ID'] = Array((
+                    String(identifier).data, String(data_hash).data))
+            dict_stream = Stream([xref_stream], extra, compress)
+            self.xref_position = dict_stream.offset = self.current_position
+            self.add_object(dict_stream)
+            self.write_line(dict_stream.indirect, output)
+        else:
+            # Write all non-free PDF objects
+            for object_ in self.objects:
+                if object_.free == 'f':
+                    continue
+                object_.offset = self.current_position
+                self.write_line(object_.indirect, output)
+
+            # Write cross-reference table
+            self.xref_position = self.current_position
+            self.write_line(b'xref', output)
+            self.write_line(f'0 {len(self.objects)}'.encode(), output)
+            for object_ in self.objects:
+                self.write_line(
+                    (f'{object_.offset:010} {object_.generation:05} '
+                     f'{object_.free} ').encode(), output)
+
+            # Write trailer
+            self.write_line(b'trailer', output)
+            self.write_line(b'<<', output)
+            self.write_line(f'/Size {len(self.objects)}'.encode(), output)
+            self.write_line(b'/Root ' + self.catalog.reference, output)
+            self.write_line(b'/Info ' + self.info.reference, output)
+            if identifier:
+                data = b''.join(
+                    obj.data for obj in self.objects if obj.free != 'f')
+                data_hash = md5(data).hexdigest().encode()
+                if identifier is True:
+                    identifier = data_hash
+                self.write_line(
+                    b'/ID [' + String(identifier).data + b' ' +
+                    String(data_hash).data + b']', output)
+            self.write_line(b'>>', output)
+
         self.write_line(b'startxref', output)
         self.write_line(f'{self.xref_position}'.encode(), output)
         self.write_line(b'%%EOF', output)