1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
|
use crate::intern::Token;
use crate::myers::sqrt;
use crate::util::{strip_common_postfix, strip_common_prefix};
/// Prepares two token streams for diffing: strips the tokens shared at the
/// start and end of both files, then prunes tokens that have no counterpart
/// in the other file (see [`PreprocessedFile`]).
pub fn preprocess(
    mut file1: &[Token],
    mut file2: &[Token],
) -> (PreprocessedFile, PreprocessedFile) {
    // Trim the identical ends first; only the interior needs occurrence analysis.
    let common_prefix = strip_common_prefix(&mut file1, &mut file2);
    strip_common_postfix(&mut file1, &mut file2);
    let (occurrences1, occurrences2) = token_occurrences(file1, file2);
    (
        PreprocessedFile::new(common_prefix, &occurrences1, file1),
        PreprocessedFile::new(common_prefix, &occurrences2, file2),
    )
}
/// Computes, for every token of each file, how often that token occurs in the
/// *other* file, bucketed into the coarse [`Occurances`] scale.
fn token_occurrences(file1: &[Token], file2: &[Token]) -> (Vec<Occurances>, Vec<Occurances>) {
    const MAX_EQLIMIT: u32 = 1024;

    // Threshold above which a token is treated as `Occurances::Common`;
    // it scales with file size but is capped at MAX_EQLIMIT.
    let eqlimit1 = sqrt(file1.len()).min(MAX_EQLIMIT);
    let eqlimit2 = sqrt(file2.len()).min(MAX_EQLIMIT);

    // Histogram of token ids per file (index = token id, value = count).
    let mut counts1: Vec<u32> = Vec::new();
    for token in file1 {
        let bucket = token.0 as usize;
        if counts1.len() <= bucket {
            counts1.resize(bucket + 1, 0u32);
        }
        counts1[bucket] += 1;
    }
    let mut counts2: Vec<u32> = Vec::new();
    for token in file2 {
        let bucket = token.0 as usize;
        if counts2.len() <= bucket {
            counts2.resize(bucket + 1, 0u32);
        }
        counts2[bucket] += 1;
    }

    // Classify each token of `file` by its count in the other file's histogram.
    let classify = |file: &[Token], other_counts: &[u32], eqlimit: u32| -> Vec<Occurances> {
        file.iter()
            .map(|token| {
                let count = other_counts.get(token.0 as usize).copied().unwrap_or(0);
                Occurances::from_occurances(count, eqlimit)
            })
            .collect()
    };

    // NOTE: the limit comes from the file the token belongs to, while the
    // count comes from the other file (this mirrors the original pairing).
    let token_occurances1 = classify(file1, &counts2, eqlimit1);
    let token_occurances2 = classify(file2, &counts1, eqlimit2);
    (token_occurances1, token_occurances2)
}
/// Coarse classification of how often a token occurs in a file.
#[derive(Clone, Copy, Debug)]
enum Occurances {
    /// Token does not occur in this file
    None,
    /// Token occurs at least once
    Some,
    /// Token occurs very frequently (exact number depends on file size).
    /// Such tokens are usually empty lines or braces and are often not meaningful to a diff
    Common,
}
impl Occurances {
    /// Maps a raw occurrence count onto the coarse [`Occurances`] scale:
    /// `0` -> `None`, `eqlimit` or more -> `Common`, anything in between -> `Some`.
    pub fn from_occurances(occurances: u32, eqlimit: u32) -> Occurances {
        match occurances {
            0 => Occurances::None,
            n if n >= eqlimit => Occurances::Common,
            _ => Occurances::Some,
        }
    }
}
/// A token stream that has been stripped and pruned for diffing.
#[derive(Debug)]
pub struct PreprocessedFile {
    // Common-prefix length reported by `strip_common_prefix` — presumably the
    // offset of these tokens within the original file; confirm at the caller.
    pub offset: u32,
    // One entry per token of the (prefix/postfix-stripped) file: `true` when
    // that token was pruned by `prune_unmatched_tokens` and is therefore
    // treated as changed.
    pub is_changed: Vec<bool>,
    // For each retained token, its index within the stripped token stream.
    pub indices: Vec<u32>,
    // The retained tokens themselves.
    pub tokens: Vec<Token>,
}
impl PreprocessedFile {
    /// Builds a [`PreprocessedFile`] by pruning tokens according to
    /// `token_diff`, recording which positions were dropped in `is_changed`.
    fn new(offset: u32, token_diff: &[Occurances], tokens: &[Token]) -> PreprocessedFile {
        let mut is_changed = vec![false; tokens.len()];
        let (tokens, indices) = prune_unmatched_tokens(tokens, token_diff, &mut is_changed);
        PreprocessedFile {
            offset,
            is_changed,
            indices,
            tokens,
        }
    }
}
/// Drops tokens that cannot take part in a match: tokens absent from the
/// other file are always pruned, and `Common` tokens are pruned when the
/// surrounding-context heuristic says so. Pruned positions are flagged in
/// `changed`; returns the retained tokens and their original indices.
fn prune_unmatched_tokens(
    file: &[Token],
    token_status: &[Occurances],
    changed: &mut [bool],
) -> (Vec<Token>, Vec<u32>) {
    assert_eq!(token_status.len(), file.len());
    let mut kept_tokens = Vec::new();
    let mut kept_indices = Vec::new();
    for (i, (&token, &status)) in file.iter().zip(token_status).enumerate() {
        let prune = match status {
            Occurances::None => true,
            Occurances::Some => false,
            Occurances::Common => should_prune_common_line(token_status, i),
        };
        if prune {
            changed[i] = true;
        } else {
            kept_tokens.push(token);
            kept_indices.push(i as u32);
        }
    }
    (kept_tokens, kept_indices)
}
// TODO do not unnecessarily rescan lines
/// Heuristic for `Occurances::Common` tokens: prune the token at `pos` when it
/// sits inside runs of unmatched (`None`) and common tokens on BOTH sides and
/// unmatched tokens clearly dominate. Scans at most `WINDOW_SIZE` tokens in
/// each direction, stopping early at the first matched (`Some`) token.
fn should_prune_common_line(token_status: &[Occurances], pos: usize) -> bool {
    const WINDOW_SIZE: usize = 100;

    let mut unmatched_before = 0;
    let mut common_before = 0;
    // BUGFIX: the backward window starts at `pos - WINDOW_SIZE`, not at the
    // constant index `WINDOW_SIZE` itself — the old expression made the
    // backward scan effectively unbounded for every `pos` past index 100.
    let start = pos.saturating_sub(WINDOW_SIZE);
    for status in token_status[start..pos].iter().rev() {
        match status {
            Occurances::None => {
                unmatched_before += 1;
            }
            Occurances::Common => {
                common_before += 1;
            }
            Occurances::Some => break,
        }
    }
    // A common token directly preceded by matched context is kept.
    if unmatched_before == 0 {
        return false;
    }
    let end = token_status.len().min(pos + WINDOW_SIZE);
    let mut unmatched_after = 0;
    let mut common_after = 0;
    // Forward scan; note this includes `pos` itself, counted as common.
    for status in token_status[pos..end].iter() {
        match status {
            Occurances::None => {
                unmatched_after += 1;
            }
            Occurances::Common => {
                common_after += 1;
            }
            Occurances::Some => break,
        }
    }
    if unmatched_after == 0 {
        return false;
    }
    // Prune only when unmatched tokens clearly outnumber the common ones.
    let common = common_before + common_after;
    let unmatched = unmatched_before + unmatched_after;
    unmatched > 3 * common
}
|