Adding upstream version 1.23.upstream/1.23 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-04 14:23:32 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-04 14:23:32 +0000
commit: 74b9339658d8ea3690d32c237223e5974b601ed9 (patch)
tree: 98c0d79abbdcf18b8509c29bacee7e75314732e9 /encoder.cc
parent: Initial commit. (diff)
download: lzip-upstream.tar.xz
lzip-upstream.zip
1 files changed, 594 insertions, 0 deletions
diff --git a/encoder.cc b/encoder.cc
new file mode 100644
index 0000000..afb38d2
--- /dev/null
+++ b/encoder.cc
@@ -0,0 +1,594 @@
+/* Lzip - LZMA lossless data compressor
+   Copyright (C) 2008-2022 Antonio Diaz Diaz.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#define _FILE_OFFSET_BITS 64
+
+#include <algorithm>
+#include <cerrno>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <vector>
+#include <stdint.h>
+
+#include "lzip.h"
+#include "encoder_base.h"
+#include "encoder.h"
+
+
+const CRC32 crc32;
+
+
+int LZ_encoder::get_match_pairs( Pair * pairs )
+  {
+  int len_limit = match_len_limit;
+  if( len_limit > available_bytes() )
+    {
+    len_limit = available_bytes();
+    if( len_limit < 4 ) return 0;
+    }
+
+  int maxlen = 3;			// only used if pairs != 0
+  int num_pairs = 0;
+  const int min_pos = ( pos > dictionary_size ) ? pos - dictionary_size : 0;
+  const uint8_t * const data = ptr_to_current_pos();
+
+  unsigned tmp = crc32[data[0]] ^ data[1];
+  const int key2 = tmp & ( num_prev_positions2 - 1 );
+  tmp ^= (unsigned)data[2] << 8;
+  const int key3 = num_prev_positions2 + ( tmp & ( num_prev_positions3 - 1 ) );
+  const int key4 = num_prev_positions23 +
+                   ( ( tmp ^ ( crc32[data[3]] << 5 ) ) & key4_mask );
+
+  if( pairs )
+    {
+    const int np2 = prev_positions[key2];
+    const int np3 = prev_positions[key3];
+    if( np2 > min_pos && buffer[np2-1] == data[0] )
+      {
+      pairs[0].dis = pos - np2;
+      pairs[0].len = maxlen = 2 + ( np2 == np3 );
+      num_pairs = 1;
+      }
+    if( np2 != np3 && np3 > min_pos && buffer[np3-1] == data[0] )
+      {
+      maxlen = 3;
+      pairs[num_pairs++].dis = pos - np3;
+      }
+    if( num_pairs > 0 )
+      {
+      const int delta = pairs[num_pairs-1].dis + 1;
+      while( maxlen < len_limit && data[maxlen-delta] == data[maxlen] )
+        ++maxlen;
+      pairs[num_pairs-1].len = maxlen;
+      if( maxlen < 3 ) maxlen = 3;
+      if( maxlen >= len_limit ) pairs = 0;	// done. now just skip
+      }
+    }
+
+  const int pos1 = pos + 1;
+  prev_positions[key2] = pos1;
+  prev_positions[key3] = pos1;
+  int newpos1 = prev_positions[key4];
+  prev_positions[key4] = pos1;
+
+  int32_t * ptr0 = pos_array + ( cyclic_pos << 1 );
+  int32_t * ptr1 = ptr0 + 1;
+  int len = 0, len0 = 0, len1 = 0;
+
+  for( int count = cycles; ; )
+    {
+    if( newpos1 <= min_pos || --count < 0 ) { *ptr0 = *ptr1 = 0; break; }
+
+    const int delta = pos1 - newpos1;
+    int32_t * const newptr = pos_array +
+      ( ( cyclic_pos - delta +
+          ( ( cyclic_pos >= delta ) ? 0 : dictionary_size + 1 ) ) << 1 );
+    if( data[len-delta] == data[len] )
+      {
+      while( ++len < len_limit && data[len-delta] == data[len] ) {}
+      if( pairs && maxlen < len )
+        {
+        pairs[num_pairs].dis = delta - 1;
+        pairs[num_pairs].len = maxlen = len;
+        ++num_pairs;
+        }
+      if( len >= len_limit )
+        {
+        *ptr0 = newptr[0];
+        *ptr1 = newptr[1];
+        break;
+        }
+      }
+    if( data[len-delta] < data[len] )
+      {
+      *ptr0 = newpos1;
+      ptr0 = newptr + 1;
+      newpos1 = *ptr0;
+      len0 = len; if( len1 < len ) len = len1;
+      }
+    else
+      {
+      *ptr1 = newpos1;
+      ptr1 = newptr;
+      newpos1 = *ptr1;
+      len1 = len; if( len0 < len ) len = len0;
+      }
+    }
+  return num_pairs;
+  }
+
+
+void LZ_encoder::update_distance_prices()
+  {
+  for( int dis = start_dis_model; dis < modeled_distances; ++dis )
+    {
+    const int dis_slot = dis_slots[dis];
+    const int direct_bits = ( dis_slot >> 1 ) - 1;
+    const int base = ( 2 | ( dis_slot & 1 ) ) << direct_bits;
+    const int price = price_symbol_reversed( bm_dis + ( base - dis_slot ),
+                                             dis - base, direct_bits );
+    for( int len_state = 0; len_state < len_states; ++len_state )
+      dis_prices[len_state][dis] = price;
+    }
+
+  for( int len_state = 0; len_state < len_states; ++len_state )
+    {
+    int * const dsp = dis_slot_prices[len_state];
+    const Bit_model * const bmds = bm_dis_slot[len_state];
+    int slot = 0;
+    for( ; slot < end_dis_model; ++slot )
+      dsp[slot] = price_symbol6( bmds, slot );
+    for( ; slot < num_dis_slots; ++slot )
+      dsp[slot] = price_symbol6( bmds, slot ) +
+                  (((( slot >> 1 ) - 1 ) - dis_align_bits ) << price_shift_bits );
+
+    int * const dp = dis_prices[len_state];
+    int dis = 0;
+    for( ; dis < start_dis_model; ++dis )
+      dp[dis] = dsp[dis];
+    for( ; dis < modeled_distances; ++dis )
+      dp[dis] += dsp[dis_slots[dis]];
+    }
+  }
+
+
+/* Return the number of bytes advanced (ahead).
+   trials[0]..trials[ahead-1] contain the steps to encode.
+   ( trials[0].dis4 == -1 ) means literal.
+   A match/rep longer or equal than match_len_limit finishes the sequence.
+*/
+int LZ_encoder::sequence_optimizer( const int reps[num_rep_distances],
+                                    const State state )
+  {
+  int num_pairs, num_trials;
+
+  if( pending_num_pairs > 0 )			// from previous call
+    {
+    num_pairs = pending_num_pairs;
+    pending_num_pairs = 0;
+    }
+  else
+    num_pairs = read_match_distances();
+  const int main_len = ( num_pairs > 0 ) ? pairs[num_pairs-1].len : 0;
+
+  int replens[num_rep_distances];
+  int rep_index = 0;
+  for( int i = 0; i < num_rep_distances; ++i )
+    {
+    replens[i] = true_match_len( 0, reps[i] + 1 );
+    if( replens[i] > replens[rep_index] ) rep_index = i;
+    }
+  if( replens[rep_index] >= match_len_limit )
+    {
+    trials[0].price = replens[rep_index];
+    trials[0].dis4 = rep_index;
+    move_and_update( replens[rep_index] );
+    return replens[rep_index];
+    }
+
+  if( main_len >= match_len_limit )
+    {
+    trials[0].price = main_len;
+    trials[0].dis4 = pairs[num_pairs-1].dis + num_rep_distances;
+    move_and_update( main_len );
+    return main_len;
+    }
+
+  const int pos_state = data_position() & pos_state_mask;
+  const uint8_t prev_byte = peek( 1 );
+  const uint8_t cur_byte = peek( 0 );
+  const uint8_t match_byte = peek( reps[0] + 1 );
+
+  trials[1].price = price0( bm_match[state()][pos_state] );
+  if( state.is_char() )
+    trials[1].price += price_literal( prev_byte, cur_byte );
+  else
+    trials[1].price += price_matched( prev_byte, cur_byte, match_byte );
+  trials[1].dis4 = -1;					// literal
+
+  const int match_price = price1( bm_match[state()][pos_state] );
+  const int rep_match_price = match_price + price1( bm_rep[state()] );
+
+  if( match_byte == cur_byte )
+    trials[1].update( rep_match_price + price_shortrep( state, pos_state ), 0, 0 );
+
+  num_trials = std::max( main_len, replens[rep_index] );
+
+  if( num_trials < min_match_len )
+    {
+    trials[0].price = 1;
+    trials[0].dis4 = trials[1].dis4;
+    move_pos();
+    return 1;
+    }
+
+  trials[0].state = state;
+  for( int i = 0; i < num_rep_distances; ++i )
+    trials[0].reps[i] = reps[i];
+
+  for( int len = min_match_len; len <= num_trials; ++len )
+    trials[len].price = infinite_price;
+
+  for( int rep = 0; rep < num_rep_distances; ++rep )
+    {
+    if( replens[rep] < min_match_len ) continue;
+    const int price = rep_match_price + price_rep( rep, state, pos_state );
+    for( int len = min_match_len; len <= replens[rep]; ++len )
+      trials[len].update( price + rep_len_prices.price( len, pos_state ),
+                          rep, 0 );
+    }
+
+  if( main_len > replens[0] )
+    {
+    const int normal_match_price = match_price + price0( bm_rep[state()] );
+    int i = 0, len = std::max( replens[0] + 1, (int)min_match_len );
+    while( len > pairs[i].len ) ++i;
+    while( true )
+      {
+      const int dis = pairs[i].dis;
+      trials[len].update( normal_match_price + price_pair( dis, len, pos_state ),
+                          dis + num_rep_distances, 0 );
+      if( ++len > pairs[i].len && ++i >= num_pairs ) break;
+      }
+    }
+
+  int cur = 0;
+  while( true )				// price optimization loop
+    {
+    move_pos();
+    if( ++cur >= num_trials )		// no more initialized trials
+      {
+      backward( cur );
+      return cur;
+      }
+
+    const int num_pairs = read_match_distances();
+    const int newlen = ( num_pairs > 0 ) ? pairs[num_pairs-1].len : 0;
+    if( newlen >= match_len_limit )
+      {
+      pending_num_pairs = num_pairs;
+      backward( cur );
+      return cur;
+      }
+
+    // give final values to current trial
+    Trial & cur_trial = trials[cur];
+    State cur_state;
+    {
+    const int dis4 = cur_trial.dis4;
+    int prev_index = cur_trial.prev_index;
+    const int prev_index2 = cur_trial.prev_index2;
+
+    if( prev_index2 == single_step_trial )
+      {
+      cur_state = trials[prev_index].state;
+      if( prev_index + 1 == cur )			// len == 1
+        {
+        if( dis4 == 0 ) cur_state.set_short_rep();
+        else cur_state.set_char();			// literal
+        }
+      else if( dis4 < num_rep_distances ) cur_state.set_rep();
+      else cur_state.set_match();
+      }
+    else
+      {
+      if( prev_index2 == dual_step_trial )	// dis4 == 0 (rep0)
+        --prev_index;
+      else					// prev_index2 >= 0
+        prev_index = prev_index2;
+      cur_state.set_char_rep();
+      }
+    cur_trial.state = cur_state;
+    for( int i = 0; i < num_rep_distances; ++i )
+      cur_trial.reps[i] = trials[prev_index].reps[i];
+    mtf_reps( dis4, cur_trial.reps );		// literal is ignored
+    }
+
+    const int pos_state = data_position() & pos_state_mask;
+    const uint8_t prev_byte = peek( 1 );
+    const uint8_t cur_byte = peek( 0 );
+    const uint8_t match_byte = peek( cur_trial.reps[0] + 1 );
+
+    int next_price = cur_trial.price +
+                     price0( bm_match[cur_state()][pos_state] );
+    if( cur_state.is_char() )
+      next_price += price_literal( prev_byte, cur_byte );
+    else
+      next_price += price_matched( prev_byte, cur_byte, match_byte );
+
+    // try last updates to next trial
+    Trial & next_trial = trials[cur+1];
+
+    next_trial.update( next_price, -1, cur );		// literal
+
+    const int match_price = cur_trial.price + price1( bm_match[cur_state()][pos_state] );
+    const int rep_match_price = match_price + price1( bm_rep[cur_state()] );
+
+    if( match_byte == cur_byte && next_trial.dis4 != 0 &&
+        next_trial.prev_index2 == single_step_trial )
+      {
+      const int price = rep_match_price + price_shortrep( cur_state, pos_state );
+      if( price <= next_trial.price )
+        {
+        next_trial.price = price;
+        next_trial.dis4 = 0;				// rep0
+        next_trial.prev_index = cur;
+        }
+      }
+
+    const int triable_bytes =
+      std::min( available_bytes(), max_num_trials - 1 - cur );
+    if( triable_bytes < min_match_len ) continue;
+
+    const int len_limit = std::min( match_len_limit, triable_bytes );
+
+    // try literal + rep0
+    if( match_byte != cur_byte && next_trial.prev_index != cur )
+      {
+      const uint8_t * const data = ptr_to_current_pos();
+      const int dis = cur_trial.reps[0] + 1;
+      const int limit = std::min( match_len_limit + 1, triable_bytes );
+      int len = 1;
+      while( len < limit && data[len-dis] == data[len] ) ++len;
+      if( --len >= min_match_len )
+        {
+        const int pos_state2 = ( pos_state + 1 ) & pos_state_mask;
+        State state2 = cur_state; state2.set_char();
+        const int price = next_price +
+                          price1( bm_match[state2()][pos_state2] ) +
+                          price1( bm_rep[state2()] ) +
+                          price_rep0_len( len, state2, pos_state2 );
+        while( num_trials < cur + 1 + len )
+          trials[++num_trials].price = infinite_price;
+        trials[cur+1+len].update2( price, cur + 1 );
+        }
+      }
+
+    int start_len = min_match_len;
+
+    // try rep distances
+    for( int rep = 0; rep < num_rep_distances; ++rep )
+      {
+      const uint8_t * const data = ptr_to_current_pos();
+      const int dis = cur_trial.reps[rep] + 1;
+      int len;
+
+      if( data[0-dis] != data[0] || data[1-dis] != data[1] ) continue;
+      for( len = min_match_len; len < len_limit; ++len )
+        if( data[len-dis] != data[len] ) break;
+      while( num_trials < cur + len )
+        trials[++num_trials].price = infinite_price;
+      int price = rep_match_price + price_rep( rep, cur_state, pos_state );
+      for( int i = min_match_len; i <= len; ++i )
+        trials[cur+i].update( price + rep_len_prices.price( i, pos_state ),
+                              rep, cur );
+
+      if( rep == 0 ) start_len = len + 1;	// discard shorter matches
+
+      // try rep + literal + rep0
+      int len2 = len + 1;
+      const int limit = std::min( match_len_limit + len2, triable_bytes );
+      while( len2 < limit && data[len2-dis] == data[len2] ) ++len2;
+      len2 -= len + 1;
+      if( len2 < min_match_len ) continue;
+
+      int pos_state2 = ( pos_state + len ) & pos_state_mask;
+      State state2 = cur_state; state2.set_rep();
+      price += rep_len_prices.price( len, pos_state ) +
+               price0( bm_match[state2()][pos_state2] ) +
+               price_matched( data[len-1], data[len], data[len-dis] );
+      pos_state2 = ( pos_state2 + 1 ) & pos_state_mask;
+      state2.set_char();
+      price += price1( bm_match[state2()][pos_state2] ) +
+               price1( bm_rep[state2()] ) +
+               price_rep0_len( len2, state2, pos_state2 );
+      while( num_trials < cur + len + 1 + len2 )
+        trials[++num_trials].price = infinite_price;
+      trials[cur+len+1+len2].update3( price, rep, cur + len + 1, cur );
+      }
+
+    // try matches
+    if( newlen >= start_len && newlen <= len_limit )
+      {
+      const int normal_match_price = match_price +
+                                     price0( bm_rep[cur_state()] );
+
+      while( num_trials < cur + newlen )
+        trials[++num_trials].price = infinite_price;
+
+      int i = 0;
+      while( pairs[i].len < start_len ) ++i;
+      int dis = pairs[i].dis;
+      for( int len = start_len; ; ++len )
+        {
+        int price = normal_match_price + price_pair( dis, len, pos_state );
+        trials[cur+len].update( price, dis + num_rep_distances, cur );
+
+        // try match + literal + rep0
+        if( len == pairs[i].len )
+          {
+          const uint8_t * const data = ptr_to_current_pos();
+          const int dis2 = dis + 1;
+          int len2 = len + 1;
+          const int limit = std::min( match_len_limit + len2, triable_bytes );
+          while( len2 < limit && data[len2-dis2] == data[len2] ) ++len2;
+          len2 -= len + 1;
+          if( len2 >= min_match_len )
+            {
+            int pos_state2 = ( pos_state + len ) & pos_state_mask;
+            State state2 = cur_state; state2.set_match();
+            price += price0( bm_match[state2()][pos_state2] ) +
+                     price_matched( data[len-1], data[len], data[len-dis2] );
+            pos_state2 = ( pos_state2 + 1 ) & pos_state_mask;
+            state2.set_char();
+            price += price1( bm_match[state2()][pos_state2] ) +
+                     price1( bm_rep[state2()] ) +
+                     price_rep0_len( len2, state2, pos_state2 );
+
+            while( num_trials < cur + len + 1 + len2 )
+              trials[++num_trials].price = infinite_price;
+            trials[cur+len+1+len2].update3( price, dis + num_rep_distances,
+                                            cur + len + 1, cur );
+            }
+          if( ++i >= num_pairs ) break;
+          dis = pairs[i].dis;
+          }
+        }
+      }
+    }
+  }
+
+
+bool LZ_encoder::encode_member( const unsigned long long member_size )
+  {
+  const unsigned long long member_size_limit =
+    member_size - Lzip_trailer::size - max_marker_size;
+  const bool best = ( match_len_limit > 12 );
+  const int dis_price_count = best ? 1 : 512;
+  const int align_price_count = best ? 1 : dis_align_size;
+  const int price_count = ( match_len_limit > 36 ) ? 1013 : 4093;
+  int price_counter = 0;		// counters may decrement below 0
+  int dis_price_counter = 0;
+  int align_price_counter = 0;
+  int reps[num_rep_distances];
+  State state;
+  for( int i = 0; i < num_rep_distances; ++i ) reps[i] = 0;
+
+  if( data_position() != 0 || renc.member_position() != Lzip_header::size )
+    return false;				// can be called only once
+
+  if( !data_finished() )			// encode first byte
+    {
+    const uint8_t prev_byte = 0;
+    const uint8_t cur_byte = peek( 0 );
+    renc.encode_bit( bm_match[state()][0], 0 );
+    encode_literal( prev_byte, cur_byte );
+    crc32.update_byte( crc_, cur_byte );
+    get_match_pairs();
+    move_pos();
+    }
+
+  while( !data_finished() )
+    {
+    if( price_counter <= 0 && pending_num_pairs == 0 )
+      {
+      price_counter = price_count;	// recalculate prices every these bytes
+      if( dis_price_counter <= 0 )
+        { dis_price_counter = dis_price_count; update_distance_prices(); }
+      if( align_price_counter <= 0 )
+        {
+        align_price_counter = align_price_count;
+        for( int i = 0; i < dis_align_size; ++i )
+          align_prices[i] = price_symbol_reversed( bm_align, i, dis_align_bits );
+        }
+      match_len_prices.update_prices();
+      rep_len_prices.update_prices();
+      }
+
+    int ahead = sequence_optimizer( reps, state );
+    price_counter -= ahead;
+
+    for( int i = 0; ahead > 0; )
+      {
+      const int pos_state = ( data_position() - ahead ) & pos_state_mask;
+      const int len = trials[i].price;
+      int dis = trials[i].dis4;
+
+      bool bit = ( dis < 0 );
+      renc.encode_bit( bm_match[state()][pos_state], !bit );
+      if( bit )					// literal byte
+        {
+        const uint8_t prev_byte = peek( ahead + 1 );
+        const uint8_t cur_byte = peek( ahead );
+        crc32.update_byte( crc_, cur_byte );
+        if( state.is_char_set_char() )
+          encode_literal( prev_byte, cur_byte );
+        else
+          {
+          const uint8_t match_byte = peek( ahead + reps[0] + 1 );
+          encode_matched( prev_byte, cur_byte, match_byte );
+          }
+        }
+      else					// match or repeated match
+        {
+        crc32.update_buf( crc_, ptr_to_current_pos() - ahead, len );
+        mtf_reps( dis, reps );
+        bit = ( dis < num_rep_distances );
+        renc.encode_bit( bm_rep[state()], bit );
+        if( bit )				// repeated match
+          {
+          bit = ( dis == 0 );
+          renc.encode_bit( bm_rep0[state()], !bit );
+          if( bit )
+            renc.encode_bit( bm_len[state()][pos_state], len > 1 );
+          else
+            {
+            renc.encode_bit( bm_rep1[state()], dis > 1 );
+            if( dis > 1 )
+              renc.encode_bit( bm_rep2[state()], dis > 2 );
+            }
+          if( len == 1 ) state.set_short_rep();
+          else
+            {
+            renc.encode_len( rep_len_model, len, pos_state );
+            rep_len_prices.decrement_counter( pos_state );
+            state.set_rep();
+            }
+          }
+        else					// match
+          {
+          dis -= num_rep_distances;
+          encode_pair( dis, len, pos_state );
+          if( dis >= modeled_distances ) --align_price_counter;
+          --dis_price_counter;
+          match_len_prices.decrement_counter( pos_state );
+          state.set_match();
+          }
+        }
+      ahead -= len; i += len;
+      if( renc.member_position() >= member_size_limit )
+        {
+        if( !dec_pos( ahead ) ) return false;
+        full_flush( state );
+        return true;
+        }
+      }
+    }
+  full_flush( state );
+  return true;
+  }
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-04 14:23:32 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-04 14:23:32 +0000
commit	74b9339658d8ea3690d32c237223e5974b601ed9 (patch)
tree	98c0d79abbdcf18b8509c29bacee7e75314732e9 /encoder.cc
parent	Initial commit. (diff)
download	lzip-upstream.tar.xz lzip-upstream.zip