From fcbf3ce37ca8f90a3e36d524a3274ffc063a40e3 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 14 Apr 2024 15:26:02 +0200 Subject: Adding upstream version 0.10.2+dfsg1. Signed-off-by: Daniel Baumann --- vendor/iio/libmergepdf/tcpdi/tcpdi_parser.php | 1450 +++++++++++++++++++++++++ 1 file changed, 1450 insertions(+) create mode 100644 vendor/iio/libmergepdf/tcpdi/tcpdi_parser.php (limited to 'vendor/iio/libmergepdf/tcpdi/tcpdi_parser.php') diff --git a/vendor/iio/libmergepdf/tcpdi/tcpdi_parser.php b/vendor/iio/libmergepdf/tcpdi/tcpdi_parser.php new file mode 100644 index 0000000..2391934 --- /dev/null +++ b/vendor/iio/libmergepdf/tcpdi/tcpdi_parser.php @@ -0,0 +1,1450 @@ +. +// +// See LICENSE file for more information. +// ------------------------------------------------------------------- +// +// Description : This is a PHP class for parsing PDF documents. +// +//============================================================+ + +/** + * @file + * This is a PHP class for parsing PDF documents.
+ * @author Paul Nicholls + * @author Nicola Asuni + * @version 1.1 + */ + +if (!defined ('PDF_TYPE_NULL')) + define ('PDF_TYPE_NULL', 0); +if (!defined ('PDF_TYPE_NUMERIC')) + define ('PDF_TYPE_NUMERIC', 1); +if (!defined ('PDF_TYPE_TOKEN')) + define ('PDF_TYPE_TOKEN', 2); +if (!defined ('PDF_TYPE_HEX')) + define ('PDF_TYPE_HEX', 3); +if (!defined ('PDF_TYPE_STRING')) + define ('PDF_TYPE_STRING', 4); +if (!defined ('PDF_TYPE_DICTIONARY')) + define ('PDF_TYPE_DICTIONARY', 5); +if (!defined ('PDF_TYPE_ARRAY')) + define ('PDF_TYPE_ARRAY', 6); +if (!defined ('PDF_TYPE_OBJDEC')) + define ('PDF_TYPE_OBJDEC', 7); +if (!defined ('PDF_TYPE_OBJREF')) + define ('PDF_TYPE_OBJREF', 8); +if (!defined ('PDF_TYPE_OBJECT')) + define ('PDF_TYPE_OBJECT', 9); +if (!defined ('PDF_TYPE_STREAM')) + define ('PDF_TYPE_STREAM', 10); +if (!defined ('PDF_TYPE_BOOLEAN')) + define ('PDF_TYPE_BOOLEAN', 11); +if (!defined ('PDF_TYPE_REAL')) + define ('PDF_TYPE_REAL', 12); + +/** + * @class tcpdi_parser + * This is a PHP class for parsing PDF documents.
+ * Based on TCPDF_PARSER, part of the TCPDF project by Nicola Asuni. + * @brief This is a PHP class for parsing PDF documents.. + * @version 1.1 + * @author Paul Nicholls - github.com/pauln + * @author Nicola Asuni - info@tecnick.com + */ +class tcpdi_parser { + /** + * Unique parser ID + * @public + */ + public $uniqueid = ''; + + /** + * Raw content of the PDF document. + * @private + */ + private $pdfdata = ''; + + /** + * XREF data. + * @protected + */ + protected $xref = array(); + + /** + * Object streams. + * @protected + */ + protected $objstreams = array(); + + /** + * Objects in objstreams. + * @protected + */ + protected $objstreamobjs = array(); + + /** + * List of seen XREF data locations. + * @protected + */ + protected $xref_seen_offsets = array(); + + /** + * Array of PDF objects. + * @protected + */ + protected $objects = array(); + + /** + * Array of object offsets. + * @private + */ + private $objoffsets = array(); + + /** + * Class object for decoding filters. + * @private + */ + private $FilterDecoders; + + /** + * Pages + * + * @private array + */ + private $pages; + + /** + * Page count + * @private integer + */ + private $page_count; + + /** + * actual page number + * @private integer + */ + private $pageno; + + /** + * PDF version of the loaded document + * @private string + */ + private $pdfVersion; + + /** + * Available BoxTypes + * + * @public array + */ + public $availableBoxes = array('/MediaBox', '/CropBox', '/BleedBox', '/TrimBox', '/ArtBox'); + +// ----------------------------------------------------------------------------- + + /** + * Parse a PDF document an return an array of objects. + * @param $data (string) PDF data to parse. + * @public + * @since 1.0.000 (2011-05-24) + */ + public function __construct($data, $uniqueid) { + if (empty($data)) { + $this->Error('Empty PDF data.'); + } + $this->uniqueid = $uniqueid; + $this->pdfdata = $data; + // get length + $pdflen = strlen($this->pdfdata); + // initialize class for decoding filters + $this->FilterDecoders = new TCPDF_FILTERS(); + // get xref and trailer data + $this->xref = $this->getXrefData(); + $this->findObjectOffsets(); + // parse all document objects + $this->objects = array(); + /*foreach ($this->xref['xref'] as $obj => $offset) { + if (!isset($this->objects[$obj]) AND ($offset > 0)) { + // decode only objects with positive offset + //$this->objects[$obj] = $this->getIndirectObject($obj, $offset, true); + } + }*/ + $this->getPDFVersion(); + $this->readPages(); + } + + /** + * Clean up when done, to free memory etc + */ + public function cleanUp() { + unset($this->pdfdata); + $this->pdfdata = ''; + unset($this->objstreams); + $this->objstreams = array(); + unset($this->objects); + $this->objects = array(); + unset($this->objstreamobjs); + $this->objstreamobjs = array(); + unset($this->xref); + $this->xref = array(); + unset($this->objoffsets); + $this->objoffsets = array(); + unset($this->pages); + $this->pages = array(); + } + + /** + * Return an array of parsed PDF document objects. + * @return (array) Array of parsed PDF document objects. + * @public + * @since 1.0.000 (2011-06-26) + */ + public function getParsedData() { + return array($this->xref, $this->objects, $this->pages); + } + + /** + * Get PDF-Version + * + * And reset the PDF Version used in FPDI if needed + * @public + */ + public function getPDFVersion() { + preg_match('/\d\.\d/', substr($this->pdfdata, 0, 16), $m); + if (isset($m[0])) + $this->pdfVersion = $m[0]; + return $this->pdfVersion; + } + + /** + * Read all /Page(es) + * + */ + function readPages() { + $params = $this->getObjectVal($this->xref['trailer'][1]['/Root']); + $objref = null; + if ($params && $params[1] && is_array($params[1][1])) { + foreach ($params[1][1] as $k=>$v) { + if ($k == '/Pages') { + $objref = $v; + break; + } + } + } + if ($objref == null || $objref[0] !== PDF_TYPE_OBJREF) { + // Offset not found. + return; + } + + $dict = $this->getObjectVal($objref); + if ($dict[0] == PDF_TYPE_OBJECT && $dict[1][0] == PDF_TYPE_DICTIONARY) { + // Dict wrapped in an object + $dict = $dict[1]; + } + + if ($dict[0] !== PDF_TYPE_DICTIONARY) { + return; + } + + $this->pages = array(); + if (isset($dict[1]['/Kids'])) { + $v = $dict[1]['/Kids']; + if ($v[0] == PDF_TYPE_ARRAY) { + foreach ($v[1] as $ref) { + $page = $this->getObjectVal($ref); + $this->readPage($page); + } + } + } + + $this->page_count = count($this->pages); + } + + /** + * Read a single /Page element, recursing through /Kids if necessary + * + */ + private function readPage($page) { + if (isset($page[1][1]['/Kids'])) { + // Nested pages! + foreach ($page[1][1]['/Kids'][1] as $subref) { + $subpage = $this->getObjectVal($subref); + $this->readPage($subpage); + } + } else { + $this->pages[] = $page; + } + } + + /** + * Get pagecount from sourcefile + * + * @return int + */ + function getPageCount() { + return $this->page_count; + } + + /** + * Get Cross-Reference (xref) table and trailer data from PDF document data. + * @param $offset (int) xref offset (if know). + * @param $xref (array) previous xref array (if any). + * @return Array containing xref and trailer data. + * @protected + * @since 1.0.000 (2011-05-24) + */ + protected function getXrefData($offset=0, $xref=array()) { + if ($offset == 0) { + // find last startxref + if (preg_match('/.*[\r\n]startxref[\s\r\n]+([0-9]+)[\s\r\n]+%%EOF/is', $this->pdfdata, $matches) == 0) { + $this->Error('Unable to find startxref'); + } + $startxref = $matches[1]; + } else { + if (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $this->pdfdata, $matches, PREG_OFFSET_CAPTURE, $offset)) { + // Cross-Reference Stream object + $startxref = $offset; + } elseif (preg_match('/[\r\n]startxref[\s\r\n]+([0-9]+)[\s\r\n]+%%EOF/i', $this->pdfdata, $matches, PREG_OFFSET_CAPTURE, $offset)) { + // startxref found + $startxref = $matches[1][0]; + } else { + $this->Error('Unable to find startxref'); + } + } + unset($matches); + + // DOMPDF gets the startxref wrong, giving us the linebreak before the xref starts. + $startxref += strspn($this->pdfdata, "\r\n", $startxref); + + // check xref position + if (strpos($this->pdfdata, 'xref', $startxref) == $startxref) { + // Cross-Reference + $xref = $this->decodeXref($startxref, $xref); + } else { + // Cross-Reference Stream + $xref = $this->decodeXrefStream($startxref, $xref); + } + if (empty($xref)) { + $this->Error('Unable to find xref'); + } + + return $xref; + } + + /** + * Decode the Cross-Reference section + * @param $startxref (int) Offset at which the xref section starts. + * @param $xref (array) Previous xref array (if any). + * @return Array containing xref and trailer data. + * @protected + * @since 1.0.000 (2011-06-20) + */ + protected function decodeXref($startxref, $xref=array()) { + $this->xref_seen_offsets[] = $startxref; + if (!isset($xref['xref_location'])) { + $xref['xref_location'] = $startxref; + $xref['max_object'] = 0; + } + // extract xref data (object indexes and offsets) + $xoffset = $startxref + 5; + // initialize object number + $obj_num = 0; + $offset = $xoffset; + while (preg_match('/^([0-9]+)[\s]([0-9]+)[\s]?([nf]?)/im', $this->pdfdata, $matches, PREG_OFFSET_CAPTURE, $offset) > 0) { + $offset = (strlen($matches[0][0]) + $matches[0][1]); + if ($matches[3][0] == 'n') { + // create unique object index: [object number]_[generation number] + $gen_num = intval($matches[2][0]); + $index = $obj_num.'_'.$gen_num; + // check if object already exist + if (!isset($xref['xref'][$obj_num][$gen_num])) { + // store object offset position + $xref['xref'][$obj_num][$gen_num] = intval($matches[1][0]); + } + ++$obj_num; + $offset += 2; + } elseif ($matches[3][0] == 'f') { + ++$obj_num; + $offset += 2; + } else { + // object number (index) + $obj_num = intval($matches[1][0]); + } + } + unset($matches); + $xref['max_object'] = max($xref['max_object'], $obj_num); + // get trailer data + if (preg_match('/trailer[\s]*<<(.*)>>[\s\r\n]+(?:[%].*[\r\n]+)*startxref[\s\r\n]+/isU', $this->pdfdata, $matches, PREG_OFFSET_CAPTURE, $xoffset) > 0) { + $trailer_data = $matches[1][0]; + if (!isset($xref['trailer']) OR empty($xref['trailer'])) { + // get only the last updated version + $xref['trailer'] = array(); + $xref['trailer'][0] = PDF_TYPE_DICTIONARY; + $xref['trailer'][1] = array(); + // parse trailer_data + if (preg_match('/Size[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) { + $xref['trailer'][1]['/Size'] = array(PDF_TYPE_NUMERIC, intval($matches[1])); + } + if (preg_match('/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) { + $xref['trailer'][1]['/Root'] = array(PDF_TYPE_OBJREF, intval($matches[1]), intval($matches[2])); + } + if (preg_match('/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) { + $xref['trailer'][1]['/Encrypt'] = array(PDF_TYPE_OBJREF, intval($matches[1]), intval($matches[2])); + } + if (preg_match('/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) { + $xref['trailer'][1]['/Info'] = array(PDF_TYPE_OBJREF, intval($matches[1]), intval($matches[2])); + } + if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailer_data, $matches) > 0) { + $xref['trailer'][1]['/ID'] = array(PDF_TYPE_ARRAY, array()); + $xref['trailer'][1]['/ID'][1][0] = array(PDF_TYPE_HEX, $matches[1]); + $xref['trailer'][1]['/ID'][1][1] = array(PDF_TYPE_HEX, $matches[2]); + } + } + if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) { + // get previous xref + $prevoffset = intval($matches[1]); + if (!in_array($prevoffset, $this->xref_seen_offsets)) { + $this->xref_seen_offsets[] = $prevoffset; + $xref = $this->getXrefData($prevoffset, $xref); + } + } + unset($matches); + } else { + $this->Error('Unable to find trailer'); + } + return $xref; + } + + /** + * Decode the Cross-Reference Stream section + * @param $startxref (int) Offset at which the xref section starts. + * @param $xref (array) Previous xref array (if any). + * @return Array containing xref and trailer data. + * @protected + * @since 1.0.003 (2013-03-16) + */ + protected function decodeXrefStream($startxref, $xref=array()) { + // try to read Cross-Reference Stream + list($xrefobj, $unused) = $this->getRawObject($startxref); + $xrefcrs = $this->getIndirectObject($xrefobj[1], $startxref, true); + if (!isset($xref['xref_location'])) { + $xref['xref_location'] = $startxref; + $xref['max_object'] = 0; + } + if (!isset($xref['xref'])) { + $xref['xref'] = array(); + } + if (!isset($xref['trailer']) OR empty($xref['trailer'])) { + // get only the last updated version + $xref['trailer'] = array(); + $xref['trailer'][0] = PDF_TYPE_DICTIONARY; + $xref['trailer'][1] = array(); + $filltrailer = true; + } else { + $filltrailer = false; + } + $valid_crs = false; + $sarr = $xrefcrs[0][1]; + $keys = array_keys($sarr); + $columns = 1; // Default as per PDF 32000-1:2008. + $predictor = 1; // Default as per PDF 32000-1:2008. + foreach ($keys as $k=>$key) { + $v = $sarr[$key]; + if (($key == '/Type') AND ($v[0] == PDF_TYPE_TOKEN AND ($v[1] == 'XRef'))) { + $valid_crs = true; + } elseif (($key == '/Index') AND ($v[0] == PDF_TYPE_ARRAY AND count($v[1]) >= 2)) { + // first object number in the subsection + $index_first = intval($v[1][0][1]); + // number of entries in the subsection + $index_entries = intval($v[1][1][1]); + } elseif (($key == '/Prev') AND ($v[0] == PDF_TYPE_NUMERIC)) { + // get previous xref offset + $prevxref = intval($v[1]); + } elseif (($key == '/W') AND ($v[0] == PDF_TYPE_ARRAY)) { + // number of bytes (in the decoded stream) of the corresponding field + $wb = array(); + $wb[0] = intval($v[1][0][1]); + $wb[1] = intval($v[1][1][1]); + $wb[2] = intval($v[1][2][1]); + } elseif (($key == '/DecodeParms') AND ($v[0] == PDF_TYPE_DICTIONARY)) { + $decpar = $v[1]; + foreach ($decpar as $kdc => $vdc) { + if (($kdc == '/Columns') AND ($vdc[0] == PDF_TYPE_NUMERIC)) { + $columns = intval($vdc[1]); + } elseif (($kdc == '/Predictor') AND ($vdc[0] == PDF_TYPE_NUMERIC)) { + $predictor = intval($vdc[1]); + } + } + } elseif ($filltrailer) { + switch($key) { + case '/Size': + case '/Root': + case '/Info': + case '/ID': + $xref['trailer'][1][$key] = $v; + break; + default: + break; + } + } + } + // decode data + $obj_num = 0; + if ($valid_crs AND isset($xrefcrs[1][3][0])) { + // number of bytes in a row + $rowlen = ($columns + 1); + // convert the stream into an array of integers + $sdata = unpack('C*', $xrefcrs[1][3][0]); + // split the rows + $sdata = array_chunk($sdata, $rowlen); + // initialize decoded array + $ddata = array(); + // initialize first row with zeros + $prev_row = array_fill (0, $rowlen, 0); + // for each row apply PNG unpredictor + foreach ($sdata as $k => $row) { + // initialize new row + $ddata[$k] = array(); + // get PNG predictor value + if (empty($predictor)) { + $predictor = (10 + $row[0]); + } + // for each byte on the row + for ($i=1; $i<=$columns; ++$i) { + if (!isset($row[$i])) { + // No more data in this row - we're done here. + break; + } + // new index + $j = ($i - 1); + $row_up = $prev_row[$j]; + if ($i == 1) { + $row_left = 0; + $row_upleft = 0; + } else { + $row_left = $row[($i - 1)]; + $row_upleft = $prev_row[($j - 1)]; + } + switch ($predictor) { + case 1: // No prediction (equivalent to PNG None) + case 10: { // PNG prediction (on encoding, PNG None on all rows) + $ddata[$k][$j] = $row[$i]; + break; + } + case 11: { // PNG prediction (on encoding, PNG Sub on all rows) + $ddata[$k][$j] = (($row[$i] + $row_left) & 0xff); + break; + } + case 12: { // PNG prediction (on encoding, PNG Up on all rows) + $ddata[$k][$j] = (($row[$i] + $row_up) & 0xff); + break; + } + case 13: { // PNG prediction (on encoding, PNG Average on all rows) + $ddata[$k][$j] = (($row[$i] + (($row_left + $row_up) / 2)) & 0xff); + break; + } + case 14: { // PNG prediction (on encoding, PNG Paeth on all rows) + // initial estimate + $p = ($row_left + $row_up - $row_upleft); + // distances + $pa = abs($p - $row_left); + $pb = abs($p - $row_up); + $pc = abs($p - $row_upleft); + $pmin = min($pa, $pb, $pc); + // return minumum distance + switch ($pmin) { + case $pa: { + $ddata[$k][$j] = (($row[$i] + $row_left) & 0xff); + break; + } + case $pb: { + $ddata[$k][$j] = (($row[$i] + $row_up) & 0xff); + break; + } + case $pc: { + $ddata[$k][$j] = (($row[$i] + $row_upleft) & 0xff); + break; + } + } + break; + } + default: { // PNG prediction (on encoding, PNG optimum) + $this->Error("Unknown PNG predictor $predictor"); + break; + } + } + } + $prev_row = $ddata[$k]; + } // end for each row + // complete decoding + unset($sdata); + $sdata = array(); + // for every row + foreach ($ddata as $k => $row) { + // initialize new row + $sdata[$k] = array(0, 0, 0); + if ($wb[0] == 0) { + // default type field + $sdata[$k][0] = 1; + } + $i = 0; // count bytes on the row + // for every column + for ($c = 0; $c < 3; ++$c) { + // for every byte on the column + for ($b = 0; $b < $wb[$c]; ++$b) { + if (isset($row[$i])) { + $sdata[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8)); + } + ++$i; + } + } + } + unset($ddata); + // fill xref + if (isset($index_first)) { + $obj_num = $index_first; + } else { + $obj_num = 0; + } + foreach ($sdata as $k => $row) { + switch ($row[0]) { + case 0: { // (f) linked list of free objects + ++$obj_num; + break; + } + case 1: { // (n) objects that are in use but are not compressed + // create unique object index: [object number]_[generation number] + $index = $obj_num.'_'.$row[2]; + // check if object already exist + if (!isset($xref['xref'][$obj_num][$row[2]])) { + // store object offset position + $xref['xref'][$obj_num][$row[2]] = $row[1]; + } + ++$obj_num; + break; + } + case 2: { // compressed objects + // $row[1] = object number of the object stream in which this object is stored + // $row[2] = index of this object within the object stream + /*$index = $row[1].'_0_'.$row[2]; + $xref['xref'][$row[1]][0][$row[2]] = -1;*/ + break; + } + default: { // null objects + break; + } + } + } + } // end decoding data + $xref['max_object'] = max($xref['max_object'], $obj_num); + if (isset($prevxref)) { + // get previous xref + $xref = $this->getXrefData($prevxref, $xref); + } + return $xref; + } + + /** + * Get raw stream data + * @param $offset (int) Stream offset. + * @param $length (int) Stream length. + * @return string Steam content + * @protected + */ + protected function getRawStream($offset, $length) { + $offset += strspn($this->pdfdata, "\x00\x09\x0a\x0c\x0d\x20", $offset); + $offset += 6; // "stream" + $offset += strspn($this->pdfdata, "\x20", $offset); + $offset += strspn($this->pdfdata, "\r\n", $offset); + + $obj = array(); + $obj[] = PDF_TYPE_STREAM; + $obj[] = substr($this->pdfdata, $offset, $length); + + return array($obj, $offset+$length); + } + + /** + * Get object type, raw value and offset to next object + * @param $offset (int) Object offset. + * @return array containing object type, raw value and offset to next object + * @protected + * @since 1.0.000 (2011-06-20) + */ + protected function getRawObject($offset=0, $data=null) { + if ($data == null) { + $data =& $this->pdfdata; + } + $objtype = ''; // object type to be returned + $objval = ''; // object value to be returned + // skip initial white space chars: \x00 null (NUL), \x09 horizontal tab (HT), \x0A line feed (LF), \x0C form feed (FF), \x0D carriage return (CR), \x20 space (SP) + while (strspn($data[$offset], "\x00\x09\x0a\x0c\x0d\x20") == 1) { + $offset++; + } + // get first char + $char = $data[$offset]; + // get object type + switch ($char) { + case '%': { // \x25 PERCENT SIGN + // skip comment and search for next token + $next = strcspn($data, "\r\n", $offset); + if ($next > 0) { + $offset += $next; + list($obj, $unused) = $this->getRawObject($offset, $data); + return $obj; + } + break; + } + case '/': { // \x2F SOLIDUS + // name object + $objtype = PDF_TYPE_TOKEN; + ++$offset; + $length = strcspn($data, "\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25", $offset); + $objval = substr($data, $offset, $length); + $offset += $length; + break; + } + case '(': // \x28 LEFT PARENTHESIS + case ')': { // \x29 RIGHT PARENTHESIS + // literal string object + $objtype = PDF_TYPE_STRING; + ++$offset; + $strpos = $offset; + if ($char == '(') { + $open_bracket = 1; + while ($open_bracket > 0) { + if (!isset($data[$strpos])) { + break; + } + $ch = $data[$strpos]; + switch ($ch) { + case '\\': { // REVERSE SOLIDUS (5Ch) (Backslash) + // skip next character + ++$strpos; + break; + } + case '(': { // LEFT PARENHESIS (28h) + ++$open_bracket; + break; + } + case ')': { // RIGHT PARENTHESIS (29h) + --$open_bracket; + break; + } + } + ++$strpos; + } + $objval = substr($data, $offset, ($strpos - $offset - 1)); + $offset = $strpos; + } + break; + } + case '[': // \x5B LEFT SQUARE BRACKET + case ']': { // \x5D RIGHT SQUARE BRACKET + // array object + $objtype = PDF_TYPE_ARRAY; + ++$offset; + if ($char == '[') { + // get array content + $objval = array(); + do { + // get element + list($element, $offset) = $this->getRawObject($offset, $data); + $objval[] = $element; + } while ($element[0] !== ']'); + // remove closing delimiter + array_pop($objval); + } else { + $objtype = ']'; + } + break; + } + case '<': // \x3C LESS-THAN SIGN + case '>': { // \x3E GREATER-THAN SIGN + if (isset($data[($offset + 1)]) AND ($data[($offset + 1)] == $char)) { + // dictionary object + $objtype = PDF_TYPE_DICTIONARY; + if ($char == '<') { + list ($objval, $offset) = $this->getDictValue($offset, $data); + } else { + $objtype = '>>'; + $offset += 2; + } + } else { + // hexadecimal string object + $objtype = PDF_TYPE_HEX; + ++$offset; + // The "Panose" entry in the FontDescriptor Style dict seems to have hex bytes separated by spaces. + if (($char == '<') AND (preg_match('/^([0-9A-Fa-f ]+)[>]/iU', substr($data, $offset), $matches) == 1)) { + $objval = $matches[1]; + $offset += strlen($matches[0]); + unset($matches); + } + } + break; + } + default: { + $frag = $data[$offset] . @$data[$offset+1] . @$data[$offset+2] . @$data[$offset+3]; + switch ($frag) { + case 'endo': + // indirect object + $objtype = 'endobj'; + $offset += 6; + break; + case 'stre': + // Streams should always be indirect objects, and thus processed by getRawStream(). + // If we get here, treat it as a null object as something has gone wrong. + case 'null': + // null object + $objtype = PDF_TYPE_NULL; + $offset += 4; + $objval = 'null'; + break; + case 'true': + // boolean true object + $objtype = PDF_TYPE_BOOLEAN; + $offset += 4; + $objval = true; + break; + case 'fals': + // boolean false object + $objtype = PDF_TYPE_BOOLEAN; + $offset += 5; + $objval = false; + break; + case 'ends': + // end stream object + $objtype = 'endstream'; + $offset += 9; + break; + default: + if (preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+([Robj]{1,3})/i', substr($data, $offset, 33), $matches) == 1) { + if ($matches[3] == 'R') { + // indirect object reference + $objtype = PDF_TYPE_OBJREF; + $offset += strlen($matches[0]); + $objval = array(intval($matches[1]), intval($matches[2])); + } elseif ($matches[3] == 'obj') { + // object start + $objtype = PDF_TYPE_OBJECT; + $objval = intval($matches[1]).'_'.intval($matches[2]); + $offset += strlen ($matches[0]); + } + } elseif (($numlen = strspn($data, '+-.0123456789', $offset)) > 0) { + // numeric object + $objval = substr($data, $offset, $numlen); + $objtype = (intval($objval) != $objval) ? PDF_TYPE_REAL : PDF_TYPE_NUMERIC; + $offset += $numlen; + } + unset($matches); + break; + } + break; + } + } + $obj = array(); + $obj[] = $objtype; + if ($objtype == PDF_TYPE_OBJREF && is_array($objval)) { + foreach ($objval as $val) { + $obj[] = $val; + } + } else { + $obj[] = $objval; + } + return array($obj, $offset); + } + private function getDictValue($offset, &$data) { + $objval = array(); + + // Extract dict from data. + $i=1; + $dict = ''; + $offset += 2; + do { + if ($data[$offset] == '>' && $data[$offset+1] == '>') { + $i--; + $dict .= '>>'; + $offset += 2; + } else if ($data[$offset] == '<' && $data[$offset+1] == '<') { + $i++; + $dict .= '<<'; + $offset += 2; + } else { + $dict .= $data[$offset]; + $offset++; + } + } while ($i>0); + + // Now that we have just the dict, parse it. + $dictoffset = 0; + do { + // Get dict element. + list($key, $eloffset) = $this->getRawObject($dictoffset, $dict); + if ($key[0] == '>>') { + break; + } + list($element, $dictoffset) = $this->getRawObject($eloffset, $dict); + $objval['/'.$key[1]] = $element; + unset($key); + unset($element); + } while (true); + + return array($objval, $offset); + } + + /** + * Get content of indirect object. + * @param $obj_ref (string) Object number and generation number separated by underscore character. + * @param $offset (int) Object offset. + * @param $decoding (boolean) If true decode streams. + * @return array containing object data. + * @protected + * @since 1.0.000 (2011-05-24) + */ + protected function getIndirectObject($obj_ref, $offset=0, $decoding=true) { + $obj = explode('_', $obj_ref); + if (($obj === false) OR (count($obj) != 2)) { + $this->Error('Invalid object reference: '.$obj); + return; + } + $objref = $obj[0].' '.$obj[1].' obj'; + + if (strpos($this->pdfdata, $objref, $offset) != $offset) { + // an indirect reference to an undefined object shall be considered a reference to the null object + return array('null', 'null', $offset); + } + // starting position of object content + $offset += strlen($objref); + // get array of object content + $objdata = array(); + $i = 0; // object main index + do { + if (($i > 0) AND (isset($objdata[($i - 1)][0])) AND ($objdata[($i - 1)][0] == PDF_TYPE_DICTIONARY) AND array_key_exists('/Length', $objdata[($i - 1)][1])) { + // Stream - get using /Length in stream's dict + $lengthobj = $objdata[($i-1)][1]['/Length']; + if ($lengthobj[0] === PDF_TYPE_OBJREF) { + $lengthobj = $this->getObjectVal($lengthobj); + if ($lengthobj[0] === PDF_TYPE_OBJECT) { + $lengthobj = $lengthobj[1]; + } + } + $streamlength = $lengthobj[1]; + list($element, $offset) = $this->getRawStream($offset, $streamlength); + } else { + // get element + list($element, $offset) = $this->getRawObject($offset); + } + // decode stream using stream's dictionary information + if ($decoding AND ($element[0] == PDF_TYPE_STREAM) AND (isset($objdata[($i - 1)][0])) AND ($objdata[($i - 1)][0] == PDF_TYPE_DICTIONARY)) { + $element[3] = $this->decodeStream($objdata[($i - 1)][1], $element[1]); + } + $objdata[$i] = $element; + ++$i; + } while ($element[0] != 'endobj'); + // remove closing delimiter + array_pop($objdata); + // return raw object content + return $objdata; + } + + /** + * Get the content of object, resolving indect object reference if necessary. + * @param $obj (string) Object value. + * @return array containing object data. + * @public + * @since 1.0.000 (2011-06-26) + */ + public function getObjectVal($obj) { + if ($obj[0] == PDF_TYPE_OBJREF) { + if (strpos($obj[1], '_') !== false) { + $key = explode('_', $obj[1]); + } else { + $key = array($obj[1], $obj[2]); + } + + $ret = array(0=>PDF_TYPE_OBJECT, 'obj'=>$key[0], 'gen'=>$key[1]); + + // reference to indirect object + $object = null; + if (isset($this->objects[$key[0]][$key[1]])) { + // this object has been already parsed + $object = $this->objects[$key[0]][$key[1]]; + } elseif (($offset = $this->findObjectOffset($key)) !== false) { + // parse new object + $this->objects[$key[0]][$key[1]] = $this->getIndirectObject($key[0].'_'.$key[1], $offset, false); + $object = $this->objects[$key[0]][$key[1]]; + } elseif (($key[1] == 0) && isset($this->objstreamobjs[$key[0]])) { + // Object is in an object stream + $streaminfo = $this->objstreamobjs[$key[0]]; + $objs = $streaminfo[0]; + if (!isset($this->objstreams[$objs[0]][$objs[1]])) { + // Fetch and decode object stream + $offset = $this->findObjectOffset($objs);; + $objstream = $this->getObjectVal(array(PDF_TYPE_OBJREF, $objs[0], $objs[1])); + $decoded = $this->decodeStream($objstream[1][1], $objstream[2][1]); + $this->objstreams[$objs[0]][$objs[1]] = $decoded[0]; // Store just the data, in case we need more from this objstream + // Free memory + unset($objstream); + unset($decoded); + } + $this->objects[$key[0]][$key[1]] = $this->getRawObject($streaminfo[1], $this->objstreams[$objs[0]][$objs[1]]); + $object = $this->objects[$key[0]][$key[1]]; + } + if (!is_null($object)) { + $ret[1] = $object[0]; + if (isset($object[1][0]) && $object[1][0] == PDF_TYPE_STREAM) { + $ret[0] = PDF_TYPE_STREAM; + $ret[2] = $object[1]; + } + return $ret; + } + } + return $obj; + } + + /** + * Extract object stream to find out what it contains. + * + */ + function extractObjectStream($key) { + $objref = array(PDF_TYPE_OBJREF, $key[0], $key[1]); + $obj = $this->getObjectVal($objref); + if ($obj[0] !== PDF_TYPE_STREAM || !isset($obj[1][1]['/First'][1])) { + // Not a valid object stream dictionary - skip it. + return; + } + $stream = $this->decodeStream($obj[1][1], $obj[2][1]);// Decode object stream, as we need the first bit + $first = intval($obj[1][1]['/First'][1]); + $ints = preg_split('/\s/', substr($stream[0], 0, $first)); // Get list of object / offset pairs + for ($j=1; $jobjstreamobjs[$ints[$j-1]] = array($key, $ints[$j]+$first); + } + } + + // Free memory - we may not need this at all. + unset($obj); + unset($stream); + } + + /** + * Find all object offsets. Saves having to scour the file multiple times. + * @private + */ + private function findObjectOffsets() { + $this->objoffsets = array(); + if (preg_match_all('/(*ANYCRLF)^[\s]*([0-9]+)[\s]+([0-9]+)[\s]+obj/im', $this->pdfdata, $matches, PREG_OFFSET_CAPTURE) >= 1) { + $i = 0; + $laststreamend = 0; + foreach($matches[0] as $match) { + $offset = $match[1] + strspn($match[0], "\x00\x09\x0a\x0c\x0d\x20"); + if ($offset < $laststreamend) { + // Contained within another stream, skip it. + continue; + } + $this->objoffsets[trim($match[0])] = $offset; + $dictoffset = $match[1] + strlen($match[0]); + $dictfrag = substr($this->pdfdata, $dictoffset, 256); + if (preg_match('|^\s+<<[^>]+/Length\s+(\d+)|', $dictfrag, $lengthmatch, PREG_OFFSET_CAPTURE) == 1) { + $laststreamend += intval($lengthmatch[1][0]); + } + if (preg_match('|^\s+<<[^>]+/ObjStm|', $dictfrag, $objstm) == 1) { + $this->extractObjectStream(array($matches[1][$i][0], $matches[2][$i][0])); + } + $i++; + } + } + unset($lengthmatch); + unset($dictfrag); + unset($matches); + } + + /** + * Get offset of an object. Checks xref first, then offsets found by scouring the file. + * @param $key (array) Object key to find (obj, gen). + * @return int Offset of the object in $this->pdfdata. + * @private + */ + private function findObjectOffset($key) { + $objref = $key[0].' '.$key[1].' obj'; + if (isset($this->xref['xref'][$key[0]][$key[1]])) { + $offset = $this->xref['xref'][$key[0]][$key[1]]; + if (strpos($this->pdfdata, $objref, $offset) === $offset) { + // Offset is in xref table and matches actual position in file + //echo "Offset in XREF is correct, returning
"; + return $this->xref['xref'][$key[0]][$key[1]]; + } + } + if (array_key_exists($objref, $this->objoffsets)) { + //echo "Offset found in internal reftable
"; + return $this->objoffsets[$objref]; + } + return false; + } + + /** + * Decode the specified stream. + * @param $sdic (array) Stream's dictionary array. + * @param $stream (string) Stream to decode. + * @return array containing decoded stream data and remaining filters. + * @protected + * @since 1.0.000 (2011-06-22) + */ + protected function decodeStream($sdic, $stream) { + // get stream lenght and filters + $slength = strlen($stream); + if ($slength <= 0) { + return array('', array()); + } + $filters = array(); + foreach ($sdic as $k => $v) { + if ($v[0] == PDF_TYPE_TOKEN) { + if (($k == '/Length') AND ($v[0] == PDF_TYPE_NUMERIC)) { + // get declared stream lenght + $declength = intval($v[1]); + if ($declength < $slength) { + $stream = substr($stream, 0, $declength); + $slength = $declength; + } + } elseif ($k == '/Filter') { + if ($v[0] == PDF_TYPE_TOKEN) { + // single filter + $filters[] = $v[1]; + } elseif ($v[0] == PDF_TYPE_ARRAY) { + // array of filters + foreach ($v[1] as $flt) { + if ($flt[0] == PDF_TYPE_TOKEN) { + $filters[] = $flt[1]; + } + } + } + } + } + } + // decode the stream + $remaining_filters = array(); + foreach ($filters as $filter) { + if (in_array($filter, $this->FilterDecoders->getAvailableFilters())) { + $stream = $this->FilterDecoders->decodeFilter($filter, $stream); + } else { + // add missing filter to array + $remaining_filters[] = $filter; + } + } + return array($stream, $remaining_filters); + } + + + /** + * Set pageno + * + * @param int $pageno Pagenumber to use + */ + public function setPageno($pageno) { + $pageno = ((int) $pageno) - 1; + + if ($pageno < 0 || $pageno >= $this->getPageCount()) { + $this->error("Pagenumber is wrong! (Requested $pageno, max ".$this->getPageCount().")"); + } + + $this->pageno = $pageno; + } + + /** + * Get page-resources from current page + * + * @return array + */ + public function getPageResources() { + return $this->_getPageResources($this->pages[$this->pageno]); + } + + /** + * Get page-resources from /Page + * + * @param array $obj Array of pdf-data + */ + private function _getPageResources ($obj) { // $obj = /Page + $obj = $this->getObjectVal($obj); + + // If the current object has a resources + // dictionary associated with it, we use + // it. Otherwise, we move back to its + // parent object. + if (isset ($obj[1][1]['/Resources'])) { + $res = $obj[1][1]['/Resources']; + if ($res[0] == PDF_TYPE_OBJECT) + return $res[1]; + return $res; + } else { + if (!isset ($obj[1][1]['/Parent'])) { + return false; + } else { + $res = $this->_getPageResources($obj[1][1]['/Parent']); + if ($res[0] == PDF_TYPE_OBJECT) + return $res[1]; + return $res; + } + } + } + + /** + * Get annotations from current page + * + * @return array + */ + public function getPageAnnotations() { + return $this->_getPageAnnotations($this->pages[$this->pageno]); + } + + /** + * Get annotations from /Page + * + * @param array $obj Array of pdf-data + */ + private function _getPageAnnotations ($obj) { // $obj = /Page + $obj = $this->getObjectVal($obj); + + // If the current object has an annotations + // dictionary associated with it, we use + // it. Otherwise, we move back to its + // parent object. + if (isset ($obj[1][1]['/Annots'])) { + $annots = $obj[1][1]['/Annots']; + } else { + if (!isset ($obj[1][1]['/Parent'])) { + return false; + } else { + $annots = $this->_getPageAnnotations($obj[1][1]['/Parent']); + } + } + + if ($annots[0] == PDF_TYPE_OBJREF) + return $this->getObjectVal($annots); + return $annots; + } + + + /** + * Get content of current page + * + * If more /Contents is an array, the streams are concated + * + * @return string + */ + public function getContent() { + $buffer = ''; + + if (isset($this->pages[$this->pageno][1][1]['/Contents'])) { + $contents = $this->_getPageContent($this->pages[$this->pageno][1][1]['/Contents']); + foreach($contents AS $tmp_content) { + $buffer .= $this->_rebuildContentStream($tmp_content) . ' '; + } + } + + return $buffer; + } + + + /** + * Resolve all content-objects + * + * @param array $content_ref + * @return array + */ + private function _getPageContent($content_ref) { + $contents = array(); + + if ($content_ref[0] == PDF_TYPE_OBJREF) { + $content = $this->getObjectVal($content_ref); + if ($content[1][0] == PDF_TYPE_ARRAY) { + $contents = $this->_getPageContent($content[1]); + } else { + $contents[] = $content; + } + } elseif ($content_ref[0] == PDF_TYPE_ARRAY) { + foreach ($content_ref[1] AS $tmp_content_ref) { + $contents = array_merge($contents,$this->_getPageContent($tmp_content_ref)); + } + } + + return $contents; + } + + + /** + * Rebuild content-streams + * + * @param array $obj + * @return string + */ + private function _rebuildContentStream($obj) { + $filters = array(); + + if (isset($obj[1][1]['/Filter'])) { + $_filter = $obj[1][1]['/Filter']; + + if ($_filter[0] == PDF_TYPE_OBJREF) { + $tmpFilter = $this->getObjectVal($_filter); + $_filter = $tmpFilter[1]; + } + + if ($_filter[0] == PDF_TYPE_TOKEN) { + $filters[] = $_filter; + } elseif ($_filter[0] == PDF_TYPE_ARRAY) { + $filters = $_filter[1]; + } + } + + $stream = $obj[2][1]; + + foreach ($filters AS $_filter) { + $stream = $this->FilterDecoders->decodeFilter($_filter[1], $stream); + } + + return $stream; + } + + + /** + * Get a Box from a page + * Arrayformat is same as used by fpdf_tpl + * + * @param array $page a /Page + * @param string $box_index Type of Box @see $availableBoxes + * @param float Scale factor from user space units to points + * @return array + */ + public function getPageBox($page, $box_index, $k) { + $page = $this->getObjectVal($page); + $box = null; + if (isset($page[1][1][$box_index])) + $box =& $page[1][1][$box_index]; + + if (!is_null($box) && $box[0] == PDF_TYPE_OBJREF) { + $tmp_box = $this->getObjectVal($box); + $box = $tmp_box[1]; + } + + if (!is_null($box) && $box[0] == PDF_TYPE_ARRAY) { + $b =& $box[1]; + return array('x' => $b[0][1] / $k, + 'y' => $b[1][1] / $k, + 'w' => abs($b[0][1] - $b[2][1]) / $k, + 'h' => abs($b[1][1] - $b[3][1]) / $k, + 'llx' => min($b[0][1], $b[2][1]) / $k, + 'lly' => min($b[1][1], $b[3][1]) / $k, + 'urx' => max($b[0][1], $b[2][1]) / $k, + 'ury' => max($b[1][1], $b[3][1]) / $k, + ); + } elseif (!isset ($page[1][1]['/Parent'])) { + return false; + } else { + return $this->getPageBox($this->getObjectVal($page[1][1]['/Parent']), $box_index, $k); + } + } + + /** + * Get all page boxes by page no + * + * @param int The page number + * @param float Scale factor from user space units to points + * @return array + */ + public function getPageBoxes($pageno, $k) { + return $this->_getPageBoxes($this->pages[$pageno - 1], $k); + } + + /** + * Get all boxes from /Page + * + * @param array a /Page + * @return array + */ + private function _getPageBoxes($page, $k) { + $boxes = array(); + + foreach($this->availableBoxes AS $box) { + if ($_box = $this->getPageBox($page, $box, $k)) { + $boxes[$box] = $_box; + } + } + + return $boxes; + } + + /** + * Get the page rotation by pageno + * + * @param integer $pageno + * @return array + */ + public function getPageRotation($pageno) { + return $this->_getPageRotation($this->pages[$pageno - 1]); + } + + private function _getPageRotation($obj) { // $obj = /Page + $obj = $this->getObjectVal($obj); + if (isset ($obj[1][1]['/Rotate'])) { + $res = $this->getObjectVal($obj[1][1]['/Rotate']); + if ($res[0] == PDF_TYPE_OBJECT) + return $res[1]; + return $res; + } else { + if (!isset ($obj[1][1]['/Parent'])) { + return false; + } else { + $res = (array)$this->_getPageRotation($obj[1][1]['/Parent']); + if ($res[0] == PDF_TYPE_OBJECT) + return $res[1]; + return $res; + } + } + } + + /** + * This method is automatically called in case of fatal error; it simply outputs the message and halts the execution. + * @param $msg (string) The error message + * @public + * @since 1.0.000 (2011-05-23) + */ + public function Error($msg) { + // exit program and print error + die("TCPDI_PARSER ERROR [{$this->uniqueid}]: ".$msg); + } + +} // END OF TCPDF_PARSER CLASS + +//============================================================+ +// END OF FILE +//============================================================+ -- cgit v1.2.3