summaryrefslogtreecommitdiffstats
path: root/third_party/rust/relevancy/src/url_hash.rs
blob: d31a45d06b44791f0cb4ec078e3afd4a51f09285 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

use md5::{Digest, Md5};
use url::{Host, Url};

/// A 16-byte URL hash; this is the value type produced by [`hash_url`].
pub type UrlHash = [u8; 16];

/// Extracts the portion of a URL that is used as the input for hashing.
///
/// The result is the last 3 dot-separated components of the URL's domain
/// (e.g. `baz.example.com` for `foo.bar.baz.example.com`). Domains with 3 or
/// fewer components are returned whole.
///
/// Returns `None` if `url` fails to parse or if its host is not a domain name
/// (for example, an IP address).
///
/// TODO: decide whether 3 is the right number of components to keep.
pub fn url_hash_source(url: &str) -> Option<String> {
    let parsed = Url::parse(url).ok()?;
    let name = match parsed.host()? {
        Host::Domain(name) => name,
        _ => return None,
    };
    // Walking the dots from the right, the third one (if it exists) sits just
    // before the 3 components we want to keep.
    let kept = match name.rmatch_indices('.').nth(2) {
        Some((dot, _)) => &name[dot + 1..],
        // Fewer than 3 dots: nothing to trim, so use the whole domain.
        None => name,
    };
    Some(kept.to_owned())
}

/// Computes the MD5 digest of a URL's hash source.
///
/// The string returned by [`url_hash_source`] is what gets hashed, not the
/// full URL. Returns `None` whenever [`url_hash_source`] returns `None`.
pub fn hash_url(url: &str) -> Option<UrlHash> {
    let source = url_hash_source(url)?;
    let mut md5 = Md5::new();
    md5.update(source);
    Some(md5.finalize().into())
}

#[cfg(test)]
mod test {
    use super::*;

    /// Checks `url_hash_source` against a table of URLs paired with the hash
    /// source expected for each one (`None` where no source is expected).
    #[test]
    fn test_url_hash_source() {
        let cases: [(&str, Option<&str>); 6] = [
            // Domains with 2 or 3 components are kept whole.
            ("http://example.com/some-path", Some("example.com")),
            ("http://foo.example.com/some-path", Some("foo.example.com")),
            // Longer domains are cut down to their last 3 components.
            (
                "http://foo.bar.baz.example.com/some-path",
                Some("baz.example.com"),
            ),
            ("http://foo.com.uk/some-path", Some("foo.com.uk")),
            ("http://amazon.com/some-path", Some("amazon.com")),
            // IP-address hosts have no hash source.
            ("http://192.168.0.1/some-path", None),
        ];
        for &(input, want) in cases.iter() {
            let got = url_hash_source(input);
            assert_eq!(got.as_deref(), want);
        }
    }
}