'nop', self::OP_EQ => 'eq', self::OP_DEL => 'del', self::OP_INS => 'ins', self::OP_REP => 'rep', ]; const OP_STR_TO_INT_MAP = [ 'nop' => self::OP_NOP, 'eq' => self::OP_EQ, 'del' => self::OP_DEL, 'ins' => self::OP_INS, 'rep' => self::OP_REP, ]; /** * The helper line which may be used to append to the source inputs to help * it easier to handle EOL at EOF problem. This line shouldn't be counted into diff. * * @var string */ const APPENDED_HELPER_LINE = "\u{fcf28}\u{fc232}"; /** * @var null|callable either a string or an array containing a callback function to determine if a line is "junk" or not */ private $junkCallback; /** * @var array the first sequence to compare against */ private $a = []; /** * @var array the second sequence */ private $b = []; /** * @var array the first sequence to compare against (transformed) */ private $at = []; /** * @var array the second sequence (transformed) */ private $bt = []; /** * @var array array of characters that are considered junk from the second sequence. Characters are the array key. */ private $junkDict = []; /** * @var array array of indices that do not contain junk elements */ private $b2j = []; /** * @var array */ private $options = []; /** * @var array */ private static $defaultOptions = [ 'ignoreWhitespace' => false, 'ignoreCase' => false, ]; /** * @var array */ private $matchingBlocks = []; /** * @var array generated opcodes which manipulates seq1 to seq2 */ private $opcodes = []; /** * The constructor. With the sequences being passed, they'll be set * for the sequence matcher and it will perform a basic cleanup & * calculate junk elements. * * @param string[] $a an array containing the lines to compare against * @param string[] $b an array containing the lines to compare * @param null|callable $junkCallback either an array or string that references a callback function (if there is one) to determine 'junk' characters * @param array $options the options */ public function __construct(array $a, array $b, ?callable $junkCallback = null, array $options = []) { $this->junkCallback = $junkCallback; $this->setOptions($options); $this->setSequences($a, $b); } /** * Set the options. * * @param array $options The options */ public function setOptions(array $options): self { $needRerunChainB = $this->isAnyOptionChanged($this->options, $options, ['ignoreCase', 'ignoreWhitespace']); $this->options = $options + self::$defaultOptions; if ($needRerunChainB) { $this->chainB(); } $this->resetCachedResults(); return $this; } /** * Get the options. */ public function getOptions(): array { return $this->options; } /** * Reset cached results. */ public function resetCachedResults(): self { $this->matchingBlocks = []; $this->opcodes = []; return $this; } /** * Set the first and second sequences to use with the sequence matcher. * * This method is more effecient than "->setSeq1($old)->setSeq2($new)" * because it only run the routine once. * * @param string[] $a an array containing the lines to compare against * @param string[] $b an array containing the lines to compare */ public function setSequences(array $a, array $b): self { $need_routine = false; if ($this->a !== $a) { $need_routine = true; $this->a = $a; } if ($this->b !== $b) { $need_routine = true; $this->b = $b; } if ($need_routine) { $this->chainB(); $this->resetCachedResults(); } return $this; } /** * Set the first sequence ($a) and reset any internal caches to indicate that * when calling the calculation methods, we need to recalculate them. * * @param string[] $a the sequence to set as the first sequence */ public function setSeq1(array $a): self { if ($this->a !== $a) { $this->a = $a; $this->chainB(); $this->resetCachedResults(); } return $this; } /** * Set the second sequence ($b) and reset any internal caches to indicate that * when calling the calculation methods, we need to recalculate them. * * @param string[] $b the sequence to set as the second sequence */ public function setSeq2(array $b): self { if ($this->b !== $b) { $this->b = $b; $this->chainB(); $this->resetCachedResults(); } return $this; } /** * Find the longest matching block in the two sequences, as defined by the * lower and upper constraints for each sequence. (for the first sequence, * $alo - $ahi and for the second sequence, $blo - $bhi). * * Essentially, of all of the maximal matching blocks, return the one that * startest earliest in $a, and all of those maximal matching blocks that * start earliest in $a, return the one that starts earliest in $b. * * If the junk callback is defined, do the above but with the restriction * that the junk element appears in the block. Extend it as far as possible * by matching only junk elements in both $a and $b. * * @param int $alo the lower constraint for the first sequence * @param int $ahi the upper constraint for the first sequence * @param int $blo the lower constraint for the second sequence * @param int $bhi the upper constraint for the second sequence * * @return int[] an array containing the longest match that includes the starting position in $a, start in $b and the length/size */ public function findLongestMatch(int $alo, int $ahi, int $blo, int $bhi): array { $bestI = $alo; $bestJ = $blo; $bestSize = 0; $j2Len = []; for ($i = $alo; $i < $ahi; ++$i) { if (null === ($element = $this->at[$i] ?? null)) { continue; } $newJ2Len = []; $jDict = $this->b2j[$element] ?? []; foreach ($jDict as $j) { if ($j < $blo) { continue; } if ($j >= $bhi) { break; } $k = ($j2Len[$j - 1] ?? 0) + 1; $newJ2Len[$j] = $k; if ($k > $bestSize) { $bestI = $i - $k + 1; $bestJ = $j - $k + 1; $bestSize = $k; } } $j2Len = $newJ2Len; } while ( $bestI > $alo && $bestJ > $blo && $this->at[$bestI - 1] === $this->bt[$bestJ - 1] && !$this->isBJunk($this->bt[$bestJ - 1]) ) { --$bestI; --$bestJ; ++$bestSize; } while ( $bestI + $bestSize < $ahi && $bestJ + $bestSize < $bhi && $this->at[$bestI + $bestSize] === $this->bt[$bestJ + $bestSize] && !$this->isBJunk($this->bt[$bestJ + $bestSize]) ) { ++$bestSize; } while ( $bestI > $alo && $bestJ > $blo && $this->at[$bestI - 1] === $this->bt[$bestJ - 1] && $this->isBJunk($this->bt[$bestJ - 1]) ) { --$bestI; --$bestJ; ++$bestSize; } while ( $bestI + $bestSize < $ahi && $bestJ + $bestSize < $bhi && $this->at[$bestI + $bestSize] === $this->bt[$bestJ + $bestSize] && $this->isBJunk($this->bt[$bestJ + $bestSize]) ) { ++$bestSize; } return [$bestI, $bestJ, $bestSize]; } /** * Return a nested set of arrays for all of the matching sub-sequences * in the strings $a and $b. * * Each block contains the lower constraint of the block in $a, the lower * constraint of the block in $b and finally the number of lines that the * block continues for. * * @return int[][] a nested array of the matching blocks, as described by the function */ public function getMatchingBlocks(): array { if (!empty($this->matchingBlocks)) { return $this->matchingBlocks; } $aCount = \count($this->a); $bCount = \count($this->b); $queue = [ [0, $aCount, 0, $bCount], ]; $matchingBlocks = []; while (!empty($queue)) { [$alo, $ahi, $blo, $bhi] = \array_pop($queue); [$i, $j, $k] = $x = $this->findLongestMatch($alo, $ahi, $blo, $bhi); if ($k) { $matchingBlocks[] = $x; if ($alo < $i && $blo < $j) { $queue[] = [$alo, $i, $blo, $j]; } if ($i + $k < $ahi && $j + $k < $bhi) { $queue[] = [$i + $k, $ahi, $j + $k, $bhi]; } } } \usort($matchingBlocks, function (array $a, array $b): int { $aCount = \count($a); $bCount = \count($b); $min = \min($aCount, $bCount); for ($i = 0; $i < $min; ++$i) { if ($a[$i] !== $b[$i]) { return $a[$i] <=> $b[$i]; } } return $aCount <=> $bCount; }); $i1 = $j1 = $k1 = 0; $nonAdjacent = []; foreach ($matchingBlocks as [$i2, $j2, $k2]) { if ($i1 + $k1 === $i2 && $j1 + $k1 === $j2) { $k1 += $k2; continue; } if ($k1) { $nonAdjacent[] = [$i1, $j1, $k1]; } $i1 = $i2; $j1 = $j2; $k1 = $k2; } if ($k1) { $nonAdjacent[] = [$i1, $j1, $k1]; } $nonAdjacent[] = [$aCount, $bCount, 0]; $this->matchingBlocks = $nonAdjacent; return $this->matchingBlocks; } /** * Return a list of all of the opcodes for the differences between the * two strings. * * The nested array returned contains an array describing the opcode * which includes: * 0 - The type of tag (as described below) for the opcode. * 1 - The beginning line in the first sequence. * 2 - The end line in the first sequence. * 3 - The beginning line in the second sequence. * 4 - The end line in the second sequence. * * The different types of tags include: * replace - The string from $i1 to $i2 in $a should be replaced by * the string in $b from $j1 to $j2. * delete - The string in $a from $i1 to $j2 should be deleted. * insert - The string in $b from $j1 to $j2 should be inserted at * $i1 in $a. * equal - The two strings with the specified ranges are equal. * * @return int[][] array of the opcodes describing the differences between the strings */ public function getOpcodes(): array { if (!empty($this->opcodes)) { return $this->opcodes; } $i = $j = 0; $this->opcodes = []; foreach ($this->getMatchingBlocks() as [$ai, $bj, $size]) { if ($i < $ai && $j < $bj) { $tag = self::OP_REP; } elseif ($i < $ai) { $tag = self::OP_DEL; } elseif ($j < $bj) { $tag = self::OP_INS; } else { $tag = self::OP_NOP; } if ($tag) { $this->opcodes[] = [$tag, $i, $ai, $j, $bj]; } $i = $ai + $size; $j = $bj + $size; if ($size) { $this->opcodes[] = [self::OP_EQ, $ai, $i, $bj, $j]; } } return $this->opcodes; } /** * Return a series of nested arrays containing different groups of generated * opcodes for the differences between the strings with up to $context lines * of surrounding content. * * Essentially what happens here is any big equal blocks of strings are stripped * out, the smaller subsets of changes are then arranged in to their groups. * This means that the sequence matcher and diffs do not need to include the full * content of the different files but can still provide context as to where the * changes are. * * @param int $context the number of lines of context to provide around the groups * * @return int[][][] nested array of all of the grouped opcodes */ public function getGroupedOpcodes(int $context = 3): array { $opcodes = $this->getOpcodes(); if (empty($opcodes)) { $opcodes = [ [self::OP_EQ, 0, 1, 0, 1], ]; } if ($opcodes[0][0] === self::OP_EQ) { // fix the leading sequence which is out of context. $opcodes[0] = [ $opcodes[0][0], \max($opcodes[0][1], $opcodes[0][2] - $context), $opcodes[0][2], \max($opcodes[0][3], $opcodes[0][4] - $context), $opcodes[0][4], ]; } $lastItem = \count($opcodes) - 1; if ($opcodes[$lastItem][0] === self::OP_EQ) { [$tag, $i1, $i2, $j1, $j2] = $opcodes[$lastItem]; // fix the trailing sequence which is out of context. $opcodes[$lastItem] = [ $tag, $i1, \min($i2, $i1 + $context), $j1, \min($j2, $j1 + $context), ]; } $maxRange = $context << 1; $groups = $group = []; foreach ($opcodes as [$tag, $i1, $i2, $j1, $j2]) { if ($tag === self::OP_EQ && $i2 - $i1 > $maxRange) { $group[] = [ $tag, $i1, \min($i2, $i1 + $context), $j1, \min($j2, $j1 + $context), ]; $groups[] = $group; $group = []; $i1 = \max($i1, $i2 - $context); $j1 = \max($j1, $j2 - $context); } $group[] = [$tag, $i1, $i2, $j1, $j2]; } if ( !empty($group) && ( \count($group) !== 1 || $group[0][0] !== self::OP_EQ ) ) { $groups[] = $group; } // there will be at least leading/trailing OP_EQ blocks // if we want really zero-context, we keep only non-equal blocks if ($context <= 0) { $groupsNew = []; foreach ($groups as $group) { $groupNew = []; foreach ($group as $block) { if ($block[0] !== self::OP_EQ) { $groupNew[] = $block; } } if (!empty($groupNew)) { $groupsNew[] = $groupNew; } } return $groupsNew; } return $groups; } /** * Convert an operation code from int into its string form. * * @param int $op the operation code * * @throws \InvalidArgumentException * * @return string the string representation of the operation code */ public static function opIntToStr(int $op): string { if (!isset(self::OP_INT_TO_STR_MAP[$op])) { throw new \InvalidArgumentException("Invalid OP: {$op}"); } return self::OP_INT_TO_STR_MAP[$op]; } /** * Convert an operation code from string into its int form. * * @param string $op the operation code * * @throws \InvalidArgumentException * * @return int the int representation of the operation code */ public static function opStrToInt(string $op): int { if (!isset(self::OP_STR_TO_INT_MAP[$op])) { throw new \InvalidArgumentException("Invalid OP: {$op}"); } return self::OP_STR_TO_INT_MAP[$op]; } /** * Determine if any option under test changed. * * @param array $old the old options * @param array $new the new options * @param array $keys the option keys under test */ private function isAnyOptionChanged(array $old, array $new, array $keys): bool { foreach ($keys as $key) { if (isset($new[$key]) && $new[$key] !== $old[$key]) { return true; } } return false; } /** * Get the processed line with the initialized options. * * @param string $line the line * * @return string the line after being processed */ private function processLineWithOptions(string $line): string { if ($this->options['ignoreWhitespace']) { static $whitespaces = [' ', "\t", "\r", "\n"]; $line = \str_replace($whitespaces, '', $line); } if ($this->options['ignoreCase']) { $line = \strtolower($line); } return $line; } /** * Generate the internal arrays containing the list of junk and non-junk * characters for the second ($b) sequence. */ private function chainB(): self { $this->at = \array_map([$this, 'processLineWithOptions'], $this->a); $this->bt = \array_map([$this, 'processLineWithOptions'], $this->b); $length = \count($this->bt); $this->b2j = []; $popularDict = []; for ($i = 0; $i < $length; ++$i) { $char = $this->bt[$i]; $this->b2j[$char] = $this->b2j[$char] ?? []; if ( $length >= 1000 && \count($this->b2j[$char]) * 100 > $length && $char !== self::APPENDED_HELPER_LINE ) { $popularDict[$char] = 1; unset($this->b2j[$char]); } else { $this->b2j[$char][] = $i; } } // remove leftovers foreach (\array_keys($popularDict) as $char) { unset($this->b2j[$char]); } $this->junkDict = []; if (\is_callable($this->junkCallback)) { foreach (\array_keys($popularDict) as $char) { if (($this->junkCallback)($char)) { $this->junkDict[$char] = 1; unset($popularDict[$char]); } } foreach (\array_keys($this->b2j) as $char) { if (($this->junkCallback)($char)) { $this->junkDict[$char] = 1; unset($this->b2j[$char]); } } } return $this; } /** * Checks if a particular character is in the junk dictionary * for the list of junk characters. * * @return bool $b True if the character is considered junk. False if not. */ private function isBJunk(string $b): bool { return isset($this->junkDict[$b]); } }