src/rgw/rgw-gap-list-comparator


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119

#!/usr/bin/awk -f

#
# Version 1
#
# This awk script takes two similarly sorted lists and outputs
# only the lines which exist in both lists.  The script takes
# three inputs:
#
# ./rgw-gap-list-comparator \
#     -v filetwo=gap-list-B.txt \
#     -v matchout=matched_lines.txt \
#     gap-list-A.txt
#

# Print the step-by-step instructions for using this script on stderr,
# then end the program with exit status 1.
#
# The text describes the intended workflow: run 'rgw-gap-list' twice,
# sort both outputs, then feed the sorted files to this script.  The
# file names and the script name used in the steps are kept consistent
# with each other so the commands can be followed literally.
function usage() {
  print "">>"/dev/stderr"
  print "">>"/dev/stderr"
  print "The idea behind the script is to eliminate false positive hits">>"/dev/stderr"
  print "from the rgw-gap-list tool which are due to upload timing of new">>"/dev/stderr"
  print "objects during the tool's execution.  To use the tool properly,">>"/dev/stderr"
  print "the following process should be followed:">>"/dev/stderr"
  print "">>"/dev/stderr"
  print "">>"/dev/stderr"
  print " 1: Run the 'rgw-gap-list' tool twice">>"/dev/stderr"
  print "">>"/dev/stderr"
  print " 2: Sort the resulting map files:">>"/dev/stderr"
  print "   $ export LC_ALL=C">>"/dev/stderr"
  print "   $ sort gap-list-A.gap > gap-list-A.sorted.gap">>"/dev/stderr"
  # Output name must match the file consumed in step 3.
  print "   $ sort gap-list-B.gap > gap-list-B.sorted.gap">>"/dev/stderr"
  print "   -- Where the A / B in the gap-list file names are the date/time associated with each of the respective 'rgw-gap-list' outputs">>"/dev/stderr"
  print "">>"/dev/stderr"
  # Name the script the same way the command line below invokes it.
  print " 3: Run the 'rgw-gap-list-comparator' script over the two files:">>"/dev/stderr"
  print "   $ rm matched_lines.txt">>"/dev/stderr"
  print "   $ ./rgw-gap-list-comparator -v filetwo=gap-list-B.sorted.gap -v matchout=matched_lines.txt gap-list-A.sorted.gap">>"/dev/stderr"
  print "   -- Where the A / B in the gap-list file names are the date/time associated with each of the respective 'rgw-gap-list' outputs">>"/dev/stderr"
  print "">>"/dev/stderr"
  print " The resulting 'matched_lines.txt' will be a high confidence list of impacted objects with little to no false positives.">>"/dev/stderr"
  print "">>"/dev/stderr"
  print "">>"/dev/stderr"
  exit 1
}

# Read the next line of the second list (the file named by the global
# 'filetwo') into the global 'f2line'.
#
# Effects on globals:
#   success      - f2line holds the new line (as a plain string) and
#                  f2_count is incremented.
#   end of file  - f2_eof is set to 1; f2line keeps its previous value.
#   read error   - f2_eof is set to 1, a message is written to stderr and
#                  the program ends with exit status 1, so an unreadable
#                  file is not mistaken for an empty list.
#
# 'rc' is a local variable (awk's extra-parameter idiom); callers pass
# no arguments.
function advance_f2(    rc) {
  rc = (getline f2line < filetwo)
  if (rc > 0) {
    # Input read by getline that looks like a number is compared
    # numerically against other numeric-looking input (so "10" would
    # equal "10.0").  Appending "" turns f2line into a plain string,
    # which forces every comparison against it to be a string
    # comparison, in line with the sort(1) ordering of the inputs.
    f2line = f2line ""
    f2_count++
    return
  }
  f2_eof = 1
  if (rc < 0) {
    print "Error: unable to read '" filetwo "'." >> "/dev/stderr"
    exit 1
  }
}

function test_lines() {
 if($0==f2line) {
    print $0>>matchout
    lineoutcount++
    advance_f2()
    return 0
  } else if ($0>f2line) {
    return 2
  } else {
    return 1
  }
}

# Write one progress line to stderr: the current timestamp followed by
# the number of lines read from each list and the number of matched
# lines written so far.
#
# 'stamp' is a local variable (awk's extra-parameter idiom); callers
# pass no arguments.
function status_out(    stamp) {
  stamp = get_date_time()
  printf("%s % 17d\t% 17d\t% 12d\n", stamp, f1_count, f2_count, lineoutcount) >> "/dev/stderr"
}

# Return the current date and time as reported by the external command
# `date +%F\ %T`, or an empty string if the command produced no output.
#
# 'cmd' and 'stamp' are local variables (awk's extra-parameter idiom) so
# the function does not leave globals behind and never returns a stale
# timestamp from an earlier call; callers pass no arguments.
function get_date_time(    cmd, stamp) {
  # The doubled backslash yields a single backslash in the command text,
  # which escapes the space for the shell.
  cmd = "date +%F\\ %T"
  stamp = ""
  cmd | getline stamp
  # Close the pipe so the next call runs the command again.
  close(cmd)
  return stamp
}

# Start-up: validate the required -v parameters, reset the counters,
# read the first line of the second list and print the status header.
BEGIN {
  # Both 'filetwo' and 'matchout' must be supplied with -v; otherwise
  # explain the problem on stderr and show the usage text.  usage() ends
  # the program with exit status 1.
  if(filetwo==""||matchout=="") {
     print "">>"/dev/stderr"
     print "">>"/dev/stderr"
     # Diagnostics belong on stderr with the rest of the message.
     print "Missing parameter.">>"/dev/stderr"
     print "">>"/dev/stderr"
     print "">>"/dev/stderr"
     usage()
  }

  f1_count=0          # lines read from the first list (main input)
  f2_count=0          # lines read from the second list (filetwo)
  lineoutcount=0      # matching lines written to matchout
  f2_eof=0            # becomes 1 once the second list is exhausted
  statusevery=100000  # emit a status line every this many input lines

  # Prime f2line with the first line of the second list.
  advance_f2()
  printf("%s File 1 Line Count\tFile 2 Line Count\tPotentially Impacted Objects\n",get_date_time())>>"/dev/stderr"
  status_out()
}


# Runs once for every line of the first list (the main input file).
{
  ++f1_count

  # Once the second list is exhausted no further line can match, so
  # stop reading the first list.
  if (f2_eof != 0) {
    exit 0
  }

  # test_lines() returns 2 while the current line compares greater than
  # f2line.  Keep advancing the second list until that is no longer the
  # case (test_lines() then records a match if the lines are equal) or
  # the second list runs out.
  while (f2_eof == 0 && test_lines() == 2) {
    advance_f2()
  }

  # Periodic progress report.
  if (f1_count % statusevery == 0) {
    status_out()
  }
}

# Final report: print one last status line, but only when at least one
# line of the first list was processed.
END {
  if (0 < f1_count) {
    status_out()
  }
}