/** * @fileoverview * Canonicalization functions used in the RTE test suite. * * Copyright 2010 Google Inc. * * Licensed under the Apache License, Version 2.0 (the 'License') * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an 'AS IS' BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * @version 0.1 * @author rolandsteiner@google.com */ /** * Canonicalize HTML entities to their actual character * * @param str {String} the HTML string to be canonicalized * @return {String} the canonicalized string */ function canonicalizeEntities(str) { // TODO(rolandsteiner): this function is very much not optimized, but that shouldn't // theoretically matter too much - look into it at some point. var match; while (match = str.match(/&#x([0-9A-F]+);/i)) { str = str.replace('&#x' + match[1] + ';', String.fromCharCode(parseInt(match[1], 16))); } while (match = str.match(/&#([0-9]+);/)) { str = str.replace('&#' + match[1] + ';', String.fromCharCode(Number(match[1]))); } return str; } /** * Canonicalize the contents of the HTML 'style' attribute. * I.e. sorts the CSS attributes alphabetically and canonicalizes the values * CSS attributes where necessary. * * If this would return an empty string, return null instead to suppress the * whole 'style' attribute. * * Avoid tests that contain {, } or : within CSS values! * * Note that this function relies on the spaces of the input string already * having been normalized by canonicalizeSpaces! * * FIXME: does not canonicalize the contents of compound attributes * (e.g., 'border'). * * @param str {String} contents of the 'style' attribute * @param emitFlags {Object} flags used for this output * @return {String/null} canonicalized string, null instead of the empty string */ function canonicalizeStyle(str, emitFlags) { // Remove any enclosing curly brackets str = str.replace(/ ?[\{\}] ?/g, ''); var attributes = str.split(';'); var count = attributes.length; var resultArr = []; for (var a = 0; a < count; ++a) { // Retrieve "name: value" pair // Note: may expectedly fail if the last pair was terminated with ';' var avPair = attributes[a].match(/ ?([^ :]+) ?: ?(.+)/); if (!avPair) continue; var name = avPair[1]; var value = avPair[2].replace(/ $/, ''); // Remove any trailing space. switch (name) { case 'color': case 'background-color': case 'border-color': if (emitFlags.canonicalizeUnits) { resultArr.push(name + ': #' + new Color(value).toHexString()); } else { resultArr.push(name + ': ' + value); } break; case 'font-family': if (emitFlags.canonicalizeUnits) { resultArr.push(name + ': ' + new FontName(value).toString()); } else { resultArr.push(name + ': ' + value); } break; case 'font-size': if (emitFlags.canonicalizeUnits) { resultArr.push(name + ': ' + new FontSize(value).toString()); } else { resultArr.push(name + ': ' + value); } break; default: resultArr.push(name + ': ' + value); } } // Sort by name, assuming no duplicate CSS attribute names. resultArr.sort(); return resultArr.join('; ') || null; } /** * Canonicalize a single attribute value. * * Note that this function relies on the spaces of the input string already * having been normalized by canonicalizeSpaces! * * @param elemName {String} the name of the element * @param attrName {String} the name of the attribute * @param attrValue {String} the value of the attribute * @param emitFlags {Object} flags used for this output * @return {String/null} the canonicalized value, or null if the attribute should be skipped. */ function canonicalizeSingleAttribute(elemName, attrName, attrValue, emitFlags) { // We emit attributes as name="value", so change any contained apostrophes // to quote marks. attrValue = attrValue.replace(/\x22/, '\x27'); switch (attrName) { case 'class': return emitFlags.emitClass ? attrValue : null; case 'id': if (!emitFlags.emitID) { return null; } if (attrValue && attrValue.substr(0, 7) == 'editor-') { return null; } return attrValue; // Remove empty style attributes, canonicalize the contents otherwise, // provided the test cares for styles. case 'style': return (emitFlags.emitStyle && attrValue) ? canonicalizeStyle(attrValue, emitFlags) : null; // Never output onload handlers as they are set by the test environment. case 'onload': return null; // Canonicalize colors. case 'bgcolor': case 'color': if (!attrValue) { return null; } return emitFlags.canonicalizeUnits ? new Color(attrValue).toString() : attrValue; // Canonicalize font names. case 'face': return emitFlags.canonicalizeUnits ? new FontName(attrValue).toString() : attrValue; // Canonicalize font sizes (leave other 'size' attributes as-is). case 'size': if (!attrValue) { return null; } switch (elemName) { case 'basefont': case 'font': return emitFlags.canonicalizeUnits ? new FontSize(attrValue).toString() : attrValue; } return attrValue; // Remove spans with value 1. Retain spans with other values, even if // empty or with a value 0, since those indicate a flawed implementation. case 'colspan': case 'rowspan': case 'span': return (attrValue == '1' || attrValue === '') ? null : attrValue; // Boolean attributes: presence equals true. If present, the value must be // the empty string or the attribute's canonical name. // (http://www.whatwg.org/specs/web-apps/current-work/#boolean-attributes) // Below we only normalize empty string to the canonical name for // comparison purposes. All other values are not touched and will therefore // in all likelihood result in a failed test (even if they may be accepted // by the UA). case 'async': case 'autofocus': case 'checked': case 'compact': case 'declare': case 'defer': case 'disabled': case 'formnovalidate': case 'frameborder': case 'ismap': case 'loop': case 'multiple': case 'nohref': case 'nosize': case 'noshade': case 'novalidate': case 'nowrap': case 'open': case 'readonly': case 'required': case 'reversed': case 'seamless': case 'selected': return attrValue ? attrValue : attrName; default: return attrValue; } } /** * Canonicalize the contents of an element tag. * * I.e. sorts the attributes alphabetically and canonicalizes their * values where necessary. Also removes attributes we're not interested in. * * Note that this function relies on the spaces of the input string already * having been normalized by canonicalizeSpaces! * * @param str {String} the contens of the element tag, excluding < and >. * @param emitFlags {Object} flags used for this output * @return {String} the canonicalized contents. */ function canonicalizeElementTag(str, emitFlags) { // FIXME: lowercase only if emitFlags.lowercase is set str = str.toLowerCase(); var pos = str.search(' '); // element name only if (pos == -1) { return str; } var elemName = str.substr(0, pos); str = str.substr(pos + 1); // Even if emitFlags.emitAttrs is not set, we must iterate over the // attributes to catch the special selection attribute and/or selection // markers. :( // Iterate over attributes, add them to an array, canonicalize their // contents, and finally output the (remaining) attributes in sorted order. // Note: We can't do a simple split on space here, because the value of, // e.g., 'style' attributes may also contain spaces. var attrs = []; var selStartInTag = false; var selEndInTag = false; while (str) { var attrName; var attrValue = ''; pos = str.search(/[ =]/); if (pos >= 0) { attrName = str.substr(0, pos); if (str.charAt(pos) == ' ') { ++pos; } if (str.charAt(pos) == '=') { ++pos; if (str.charAt(pos) == ' ') { ++pos; } str = str.substr(pos); switch (str.charAt(0)) { case '"': case "'": pos = str.indexOf(str.charAt(0), 1); pos = (pos < 0) ? str.length : pos; attrValue = str.substring(1, pos); ++pos; break; default: pos = str.indexOf(' ', 0); pos = (pos < 0) ? str.length : pos; attrValue = (pos == -1) ? str : str.substr(0, pos); break; } attrValue = attrValue.replace(/^ /, ''); attrValue = attrValue.replace(/ $/, ''); } } else { attrName = str; } str = (pos == -1 || pos >= str.length) ? '' : str.substr(pos + 1); // Remove special selection attributes. switch (attrName) { case ATTRNAME_SEL_START: selStartInTag = true; continue; case ATTRNAME_SEL_END: selEndInTag = true; continue; } switch (attrName) { case '': case 'onload': case 'xmlns': break; default: if (!emitFlags.emitAttrs) { break; } // >>> fall through >>> case 'contenteditable': attrValue = canonicalizeEntities(attrValue); attrValue = canonicalizeSingleAttribute(elemName, attrName, attrValue, emitFlags); if (attrValue !== null) { attrs.push(attrName + '="' + attrValue + '"'); } } } var result = elemName; // Sort alphabetically (on full string rather than just attribute value for // simplicity. Also, attribute names will differ when encountering the '='). if (attrs.length > 0) { attrs.sort(); result += ' ' + attrs.join(' '); } // Add intra-tag selection marker(s) or attribute(s), if any, at the end. if (selStartInTag && selEndInTag) { result += ' |'; } else if (selStartInTag) { result += ' {'; } else if (selEndInTag) { result += ' }'; } return result; } /** * Canonicalize elements and attributes to facilitate comparison to the * expectation string: sort attributes, canonicalize values and remove chaff. * * Note that this function relies on the spaces of the input string already * having been normalized by canonicalizeSpaces! * * @param str {String} the HTML string to be canonicalized * @param emitFlags {Object} flags used for this output * @return {String} the canonicalized string */ function canonicalizeElementsAndAttributes(str, emitFlags) { var tagStart = str.indexOf('<'); var tagEnd = 0; var result = ''; while (tagStart >= 0) { ++tagStart; if (str.charAt(tagStart) == '/') { ++tagStart; } result = result + canonicalizeEntities(str.substring(tagEnd, tagStart)); tagEnd = str.indexOf('>', tagStart); if (tagEnd < 0) { tagEnd = str.length - 1; } if (str.charAt(tagEnd - 1) == '/') { --tagEnd; } var elemStr = str.substring(tagStart, tagEnd); elemStr = canonicalizeElementTag(elemStr, emitFlags); result = result + elemStr; tagStart = str.indexOf('<', tagEnd); } return result + canonicalizeEntities(str.substring(tagEnd)); } /** * Canonicalize an innerHTML string to uniform single whitespaces. * * FIXME: running this prevents testing for pre-formatted content * and the CSS 'white-space' attribute. * * @param str {String} the HTML string to be canonicalized * @return {String} the canonicalized string */ function canonicalizeSpaces(str) { // Collapse sequential whitespace. str = str.replace(/\s+/g, ' '); // Remove spaces immediately inside angle brackets <, >, . // While doing this also canonicalize <.../> to <...>. str = str.replace(/\< ?/g, '<'); str = str.replace(/\<\/ ?/g, '/g, '>'); return str; } /** * Canonicalize an innerHTML string to uniform single whitespaces. * Also remove comments to retain only embedded selection markers, and * remove
and if present. * * FIXME: running this prevents testing for pre-formatted content * and the CSS 'white-space' attribute. * * @param str {String} the HTML string to be canonicalized * @return {String} the canonicalized string */ function initialCanonicalizationOf(str) { str = canonicalizeSpaces(str); str = str.replace(/ ? ?/g, ''); str = str.replace(/<\/[bh]r>/g, ''); return str; }