diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
commit | 698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch) | |
tree | 173a775858bd501c378080a10dca74132f05bc50 /vendor/minifier/src/html.rs | |
parent | Initial commit. (diff) | |
download | rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip |
Adding upstream version 1.64.0+dfsg1.upstream/1.64.0+dfsg1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/minifier/src/html.rs')
-rw-r--r-- | vendor/minifier/src/html.rs | 233 |
1 files changed, 233 insertions, 0 deletions
diff --git a/vendor/minifier/src/html.rs b/vendor/minifier/src/html.rs new file mode 100644 index 000000000..b6fa99362 --- /dev/null +++ b/vendor/minifier/src/html.rs @@ -0,0 +1,233 @@ +// Take a look at the license at the top of the repository in the LICENSE file. + +use regex::{Captures, Regex}; + +fn condense_whitespace(source: &str) -> String { + let lower_source = source.to_lowercase(); + if lower_source.find("<textarea").is_none() && lower_source.find("<pre").is_none() { + // maybe should be better not to recreate Regex every time? + let re = Regex::new(r">\s+<").unwrap(); + let source = re.replace_all(source, "> <").into_owned(); + let re = Regex::new(r"\s{2,}|[\r\n]").unwrap(); + re.replace_all(&source, " ").into_owned() + } else { + source.trim().to_owned() + } +} + +fn condense(source: &str) -> String { + let re = Regex::new(r"<(style|script)[\w|\s].*?>").unwrap(); + let type_reg = Regex::new(r#"\s*?type="[\w|\s].*?""#).unwrap(); + re.replace_all(source, |caps: &Captures| { + type_reg.replace_all(&caps[0], "").into_owned() + }) + .into_owned() +} + +fn clean_unneeded_tags(source: &str) -> String { + let useless_tags = [ + "</area>", + "</base>", + "<body>", + "</body>", + "</br>", + "</col>", + "</colgroup>", + "</dd>", + "</dt>", + "<head>", + "</head>", + "</hr>", + "<html>", + "</html>", + "</img>", + "</input>", + "</li>", + "</link>", + "</meta>", + "</option>", + "</param>", + "<tbody>", + "</tbody>", + "</td>", + "</tfoot>", + "</th>", + "</thead>", + "</tr>", + "</basefont>", + "</isindex>", + "</param>", + ]; + let mut res = source.to_owned(); + for useless_tag in &useless_tags { + res = res.replace(useless_tag, ""); + } + res +} + +fn remove_comments(source: &str) -> String { + // "build" and "endbuild" should be matched case insensitively. + let re = Regex::new("<!--(.|\n)*?-->").unwrap(); + re.replace_all(source, |caps: &Captures| { + if caps[0].replace("<!--", " ").trim().starts_with("[") { + caps[0].to_owned() + } else { + " ".to_owned() + } + }) + .into_owned() +} + +fn unquote_attributes(source: &str) -> String { + // Some attributes like width, height, etc... don't need quotes. + let any_tag = Regex::new(r"<\w.*?>").unwrap(); + let extra_spaces = Regex::new(r" \s+|\s +").unwrap(); + let between_words = Regex::new(r"\w\s+\w").unwrap(); + let spaces_before_close = Regex::new(r##""\s+>"##).unwrap(); + let spaces_before_close2 = Regex::new(r"'\s+>").unwrap(); + let extra_spaces2 = Regex::new(r##""\s\s+\w+="|'\s\s+\w+='|"\s\s+\w+=|'\s\s+\w+="##).unwrap(); + let extra_spaces3 = Regex::new(r"\d\s+>").unwrap(); + let quotes_in_tag = Regex::new(r##"([a-zA-Z]+)="([a-zA-Z0-9-_\.]+)""##).unwrap(); + + any_tag + .replace_all(source, |caps: &Captures| { + let cap = format!("{}", &caps[0]); + if cap.starts_with("<!") || cap.find("</").is_some() { + cap + } else { + let tag = spaces_before_close.replace_all(&cap, "\">").into_owned(); + let mut tag = spaces_before_close2.replace_all(&tag, "'>").into_owned(); + let tag_c = tag.clone(); + + let space1_matches: Vec<_> = between_words.find_iter(&tag_c).collect(); + let space6_matches: Vec<_> = extra_spaces3.find_iter(&tag_c).collect(); + let mut pos = 0; + loop { + let replacement = match (space1_matches.get(pos), space6_matches.get(pos)) { + (Some(a), Some(b)) => format!("{}{}", a.as_str(), b.as_str()), + (None, Some(b)) => format!("{}", b.as_str()), + (Some(a), None) => format!("{}", a.as_str()), + _ => break, + }; + pos += 1; + tag = tag.replace( + &replacement, + &extra_spaces.replace_all(&replacement, " ").into_owned(), + ); + } + let mut output = tag.clone(); + for caps in extra_spaces2.find_iter(&tag) { + let c = caps.as_str().chars().next().unwrap_or('\0'); + output = output.replace( + caps.as_str(), + &format!( + "{} {}", + if c == '\0' { + String::new() + } else { + format!("{}", c) + }, + caps.as_str()[1..].trim_start() + ), + ); + } + tag = quotes_in_tag + .replace_all(&output, |caps: &Captures| match &caps[1] { + "width" | "height" => format!("{}={}", &caps[1], &caps[2]), + x => format!("{}=\"{}\"", x, &caps[2]), + }) + .into_owned(); + if cap != tag { + tag + } else { + cap + } + } + }) + .trim() + .to_owned() +} + +/// Returns a minified version of the provided HTML source. +pub fn minify(source: &str) -> String { + let source = remove_comments(source); + let source = condense(&source); + let source = clean_unneeded_tags(&source); + let source = condense_whitespace(&source); + unquote_attributes(&source).trim().to_owned() +} + +#[test] +fn html_minify_test() { + let source = r##"<head> + <title>Some huge title</title> + <link rel="stylesheet" type="text/css" href="something.css" > + <style type="text/css"> + .some_class { + color: red; + } + </style> +</head> +<body> + <header> + <div> + <i> <b><a href="www.somewhere.com" class="some_class">Narnia</a> </b> </i> + <h1 style="width:100%;text-align:center;" >Big header</h1> + </div> + <!-- commeeeeeeeents !!! --> + </header> + <div id="some_id"> + <!-- another comment + on +multi +lines --> + <div id="another_id" class="another_class" width="100"> + <h2>A little sub title</h2> + <ul> + <li>A list!</li> + <li>Who doesn't like lists?</li> + <li height="12" class="fooool">Well, who cares...</li> + </ul> + </div> + </div> + <script type="text/javascript" > + console.log("foo"); + </script> + <style type="text/css" src="../foo.css"> + <script src="../foo.js"> +</body> +"##; + + let expected_result = "<title>Some huge title</title> <link rel=\"stylesheet\" \ + type=\"text/css\" href=\"something.css\"> <style> .some_class \ + { color: red; } </style> <header> <div> <i> <b><a \ + href=\"www.somewhere.com\" class=\"some_class\">Narnia</a> </b> </i> \ + <h1 style=\"width:100%;text-align:center;\">Big header</h1> </div> \ + </header> <div id=\"some_id\"> <div id=\"another_id\" \ + class=\"another_class\" width=100> <h2>A little sub \ + title</h2> <ul> <li>A list! <li>Who doesn't like lists? \ + <li height=12 class=\"fooool\">Well, who cares... </ul> </div> \ + </div> <script > console.log(\"foo\"); </script> <style \ + src=\"../foo.css\"> <script src=\"../foo.js\">"; + assert_eq!(minify(source), expected_result); +} + +#[test] +fn html_keep_important_comments() { + let source = r#" +<div> + <!-- normal comment --> + <div>content</div> + <!--[if lte IE 8]> + <div class="warning">This old browser is unsupported and will most likely display funky things. + </div> + <![endif]--> +</div> +"#; + + let expected_result = + "<div> <div>content</div> <!--[if lte IE 8]> <div class=\"warning\">This \ + old browser is unsupported and will most likely display funky things. \ + </div> <![endif]--> </div>"; + assert_eq!(minify(source), expected_result); +} |