summaryrefslogtreecommitdiffstats
path: root/nrep_stats.cc
blob: 2f335e6684e84a5039c1e2d5952b330e4ea8a0bb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
/* Lziprecover - Data recovery tool for the lzip format
   Copyright (C) 2009-2021 Antonio Diaz Diaz.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#define _FILE_OFFSET_BITS 64

#include <algorithm>
#include <cerrno>
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>
#include <stdint.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>

#include "lzip.h"
#include "lzip_index.h"


/* Show how well the frequency of sequences of N repeated bytes in LZMA data
   matches the value expected for random data. ( 1 / 2^( 8 * N ) )
   Print cumulative data for all files followed by the name of the first
   file with the longest sequence.
*/
int print_nrep_stats( const std::vector< std::string > & filenames,
                      const int repeated_byte, const bool ignore_errors,
                      const bool ignore_trailing, const bool loose_trailing )
  {
  std::vector< unsigned long > len_vector;
  unsigned long long best_pos = 0, lzma_size = 0;
  int best_name = -1, retval = 0;
  const bool count_all = ( repeated_byte < 0 || repeated_byte >= 256 );
  bool stdin_used = false;
  for( unsigned i = 0; i < filenames.size(); ++i )
    {
    const bool from_stdin = ( filenames[i] == "-" );
    if( from_stdin ) { if( stdin_used ) continue; else stdin_used = true; }
    const char * const input_filename =
      from_stdin ? "(stdin)" : filenames[i].c_str();
    struct stat in_stats;				// not used
    const int infd = from_stdin ? STDIN_FILENO :
      open_instream( input_filename, &in_stats, false, true );
    if( infd < 0 ) { set_retval( retval, 1 ); continue; }

    const Lzip_index lzip_index( infd, ignore_trailing, loose_trailing,
                                 ignore_errors, ignore_errors );
    if( lzip_index.retval() != 0 )
      {
      show_file_error( input_filename, lzip_index.error().c_str() );
      set_retval( retval, lzip_index.retval() );
      close( infd );
      continue;
      }
    const unsigned long long cdata_size = lzip_index.cdata_size();
    const uint8_t * const buffer =
      (const uint8_t *)mmap( 0, cdata_size, PROT_READ, MAP_PRIVATE, infd, 0 );
    close( infd );
    if( buffer == MAP_FAILED )
      { show_file_error( input_filename, "Can't mmap", errno );
        set_retval( retval, 1 ); continue; }
    for( long j = 0; j < lzip_index.members(); ++j )
      {
      const Block & mb = lzip_index.mblock( j );
      long long pos = mb.pos() + 7;		// skip header (+1 byte) and
      const long long end = mb.end() - 20;	// trailer of each member
      lzma_size += end - pos;
      while( pos < end )
        {
        const uint8_t byte = buffer[pos++];
        if( buffer[pos] == byte )
          {
          unsigned len = 2;
          ++pos;
          while( pos < end && buffer[pos] == byte ) { ++pos; ++len; }
          if( !count_all && repeated_byte != (int)byte ) continue;
          if( len >= len_vector.size() ) { len_vector.resize( len + 1 );
            best_name = i; best_pos = pos - len; }
          ++len_vector[len];
          }
        }
      }
    munmap( (void *)buffer, cdata_size );
    }

  if( count_all )
    std::fputs( "\nShowing repeated sequences of any byte value.\n", stdout );
  else
    std::printf( "\nShowing repeated sequences of the byte value 0x%02X\n",
                 repeated_byte );
  std::printf( "Total size of LZMA data: %llu bytes (%sBytes)\n",
               lzma_size, format_num( lzma_size, 999 ) );
  for( unsigned len = 2; len < len_vector.size(); ++len )
    if( len_vector[len] > 0 )
      std::printf( "len %u found %lu times, 1 every %llu bytes "
                   "(expected 1 every %sB)\n",
                   len, len_vector[len], lzma_size / len_vector[len],
                   format_num( 1ULL << ( 8 * ( len - count_all ) ), -1ULL, -1 ) );
  if( best_name >= 0 )
    std::printf( "Longest sequence found at position %llu of '%s'\n",
                 best_pos, filenames[best_name].c_str() );
  return retval;
  }