From f0bda7c287ad4012750724d1a836131e6498d9a9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Mon, 25 Jan 2021 14:52:00 +0100 Subject: Merging upstream version 1.22. Signed-off-by: Daniel Baumann --- nrep_stats.cc | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 nrep_stats.cc (limited to 'nrep_stats.cc') diff --git a/nrep_stats.cc b/nrep_stats.cc new file mode 100644 index 0000000..2f335e6 --- /dev/null +++ b/nrep_stats.cc @@ -0,0 +1,117 @@ +/* Lziprecover - Data recovery tool for the lzip format + Copyright (C) 2009-2021 Antonio Diaz Diaz. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#define _FILE_OFFSET_BITS 64 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lzip.h" +#include "lzip_index.h" + + +/* Show how well the frequency of sequences of N repeated bytes in LZMA data + matches the value expected for random data. ( 1 / 2^( 8 * N ) ) + Print cumulative data for all files followed by the name of the first + file with the longest sequence. +*/ +int print_nrep_stats( const std::vector< std::string > & filenames, + const int repeated_byte, const bool ignore_errors, + const bool ignore_trailing, const bool loose_trailing ) + { + std::vector< unsigned long > len_vector; + unsigned long long best_pos = 0, lzma_size = 0; + int best_name = -1, retval = 0; + const bool count_all = ( repeated_byte < 0 || repeated_byte >= 256 ); + bool stdin_used = false; + for( unsigned i = 0; i < filenames.size(); ++i ) + { + const bool from_stdin = ( filenames[i] == "-" ); + if( from_stdin ) { if( stdin_used ) continue; else stdin_used = true; } + const char * const input_filename = + from_stdin ? "(stdin)" : filenames[i].c_str(); + struct stat in_stats; // not used + const int infd = from_stdin ? STDIN_FILENO : + open_instream( input_filename, &in_stats, false, true ); + if( infd < 0 ) { set_retval( retval, 1 ); continue; } + + const Lzip_index lzip_index( infd, ignore_trailing, loose_trailing, + ignore_errors, ignore_errors ); + if( lzip_index.retval() != 0 ) + { + show_file_error( input_filename, lzip_index.error().c_str() ); + set_retval( retval, lzip_index.retval() ); + close( infd ); + continue; + } + const unsigned long long cdata_size = lzip_index.cdata_size(); + const uint8_t * const buffer = + (const uint8_t *)mmap( 0, cdata_size, PROT_READ, MAP_PRIVATE, infd, 0 ); + close( infd ); + if( buffer == MAP_FAILED ) + { show_file_error( input_filename, "Can't mmap", errno ); + set_retval( retval, 1 ); continue; } + for( long j = 0; j < lzip_index.members(); ++j ) + { + const Block & mb = lzip_index.mblock( j ); + long long pos = mb.pos() + 7; // skip header (+1 byte) and + const long long end = mb.end() - 20; // trailer of each member + lzma_size += end - pos; + while( pos < end ) + { + const uint8_t byte = buffer[pos++]; + if( buffer[pos] == byte ) + { + unsigned len = 2; + ++pos; + while( pos < end && buffer[pos] == byte ) { ++pos; ++len; } + if( !count_all && repeated_byte != (int)byte ) continue; + if( len >= len_vector.size() ) { len_vector.resize( len + 1 ); + best_name = i; best_pos = pos - len; } + ++len_vector[len]; + } + } + } + munmap( (void *)buffer, cdata_size ); + } + + if( count_all ) + std::fputs( "\nShowing repeated sequences of any byte value.\n", stdout ); + else + std::printf( "\nShowing repeated sequences of the byte value 0x%02X\n", + repeated_byte ); + std::printf( "Total size of LZMA data: %llu bytes (%sBytes)\n", + lzma_size, format_num( lzma_size, 999 ) ); + for( unsigned len = 2; len < len_vector.size(); ++len ) + if( len_vector[len] > 0 ) + std::printf( "len %u found %lu times, 1 every %llu bytes " + "(expected 1 every %sB)\n", + len, len_vector[len], lzma_size / len_vector[len], + format_num( 1ULL << ( 8 * ( len - count_all ) ), -1ULL, -1 ) ); + if( best_name >= 0 ) + std::printf( "Longest sequence found at position %llu of '%s'\n", + best_pos, filenames[best_name].c_str() ); + return retval; + } -- cgit v1.2.3