summary | refs | log | tree | commit | diff | stats
path: root/intl/uconv/tests/unit/test_charset_conversion.js
diff options
context:
space:
mode:
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-28 14:29:10 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-28 14:29:10 +0000
commit: 2aa4a82499d4becd2284cdb482213d541b8804dd (patch)
tree: b80bf8bf13c3766139fbacc530efd0dd9d54394c /intl/uconv/tests/unit/test_charset_conversion.js
parent: Initial commit. (diff)
download: firefox-2aa4a82499d4becd2284cdb482213d541b8804dd.tar.xz
firefox-2aa4a82499d4becd2284cdb482213d541b8804dd.zip
Adding upstream version 86.0.1. [upstream/86.0.1] [upstream]
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'intl/uconv/tests/unit/test_charset_conversion.js')
-rw-r--r-- intl/uconv/tests/unit/test_charset_conversion.js 373
1 files changed, 373 insertions, 0 deletions
diff --git a/intl/uconv/tests/unit/test_charset_conversion.js b/intl/uconv/tests/unit/test_charset_conversion.js
new file mode 100644
index 0000000000..8f1793ca18
--- /dev/null
+++ b/intl/uconv/tests/unit/test_charset_conversion.js
@@ -0,0 +1,373 @@
+// Value thrown by the UTF8/UTF16 readers in this file when they meet
+// malformed input.
+const NS_ERROR_ILLEGAL_VALUE = Cr.NS_ERROR_ILLEGAL_VALUE;
+
+// XPCOM constructor shortcuts. They are declared here and assigned in
+// run_test(), so nothing may use them before run_test() has started.
+var BIS; // nsIBinaryInputStream
+var BOS; // nsIBinaryOutputStream
+var _Pipe; // nsIPipe (use Pipe() instead of calling this directly)
+var COS; // nsIConverterOutputStream
+var FIS; // nsIFileInputStream
+var _SS; // nsIStorageStream (use StorageStream() instead)
+var CIS; // nsIConverterInputStream
+
+// Directory containing the test data files; assigned in run_test().
+var dataDir;
+
+/**
+ * Test entry point: fills in the file-level constructor shortcuts and the
+ * data directory, then runs the individual tests in order.
+ */
+function run_test() {
+  // Every shortcut is built the same way: contract ID, interface name and the
+  // name of the method passed as the third argument (presumably the
+  // initializer invoked with the constructor's arguments -- confirm against
+  // the Components.Constructor documentation).
+  const makeConstructor = (contractID, interfaceName, initializer) =>
+    Components.Constructor(contractID, interfaceName, initializer);
+
+  BIS = makeConstructor(
+    "@mozilla.org/binaryinputstream;1",
+    "nsIBinaryInputStream",
+    "setInputStream"
+  );
+  BOS = makeConstructor(
+    "@mozilla.org/binaryoutputstream;1",
+    "nsIBinaryOutputStream",
+    "setOutputStream"
+  );
+  _Pipe = makeConstructor("@mozilla.org/pipe;1", "nsIPipe", "init");
+  COS = makeConstructor(
+    "@mozilla.org/intl/converter-output-stream;1",
+    "nsIConverterOutputStream",
+    "init"
+  );
+  FIS = makeConstructor(
+    "@mozilla.org/network/file-input-stream;1",
+    "nsIFileInputStream",
+    "init"
+  );
+  _SS = makeConstructor(
+    "@mozilla.org/storagestream;1",
+    "nsIStorageStream",
+    "init"
+  );
+  CIS = makeConstructor(
+    "@mozilla.org/intl/converter-input-stream;1",
+    "nsIConverterInputStream",
+    "init"
+  );
+
+  dataDir = do_get_file("data/");
+
+  test_utf8_1();
+  test_cross_conversion();
+}
+
+// Strings that test_utf8_1() encodes to UTF-8 and then decodes again.
+const UNICODE_STRINGS = [
+  "\u00BD + \u00BE == \u00BD\u00B2 + \u00BC + \u00BE",
+
+  // One string covering each UTF-8 length class within the BMP:
+  //   "AZaz09 \u007F "                  U+000000 to U+00007F
+  //   "\u0080 \u0398 \u03BB \u0725 "    U+000080 to U+0007FF
+  //   "\u0964 \u0F5F \u20AC \uFFFB"     U+000800 to U+00FFFF
+  "AZaz09 \u007F \u0080 \u0398 \u03BB \u0725 \u0964 \u0F5F \u20AC \uFFFB",
+
+  // No non-BMP strings are included: JS strings expose 16-bit code units, so
+  // a non-BMP code point shows up as a surrogate pair and would need extra
+  // decoding before it could be compared with decoder output.
+];
+
+// Test conversion equality. Keys are the names of files (looked up inside the
+// test data directory) that contain equivalent Unicode data; values are the
+// encoding of each file, in the form expected by
+// nsIConverter(In|Out)putStream.init.
+//
+// test_cross_conversion() walks this table twice (every file against every
+// file), so each entry added here is compared with all the others.
+const UNICODE_FILES = {
+ "unicode-conversion.utf8.txt": "UTF-8",
+ "unicode-conversion.utf16.txt": "UTF-16",
+ "unicode-conversion.utf16le.txt": "UTF-16LE",
+ "unicode-conversion.utf16be.txt": "UTF-16BE",
+};
+
+/**
+ * Writes every entry of UNICODE_STRINGS through a UTF-8 converter output
+ * stream into a pipe, decodes the bytes that come out of the pipe with the
+ * UTF8 reader, and fails the test if the decoded values differ from the
+ * values obtained directly from the string.
+ */
+function test_utf8_1() {
+  UNICODE_STRINGS.forEach(function(text, index) {
+    var pipe = Pipe();
+
+    // Encode the string into the pipe.
+    var encoder = new COS(pipe.outputStream, "UTF-8");
+    Assert.ok(encoder.writeString(text));
+    encoder.close();
+
+    // Decode the other end of the pipe and compare.
+    var decoded = new UTF8(pipe.inputStream);
+    var expected = stringToCodePoints(text);
+    if (equalStreams(decoded, expected)) {
+      return;
+    }
+    do_throw("UNICODE_STRINGS[" + index + "] not handled correctly");
+  });
+}
+
+/**
+ * For every pair of files in UNICODE_FILES, checks that decoding the first
+ * file's bytes with the first file's encoding yields the same text as
+ * decoding the second file with the second file's encoding.
+ */
+function test_cross_conversion() {
+  for (var sourceName in UNICODE_FILES) {
+    var sourceCharset = UNICODE_FILES[sourceName];
+
+    // Copy the raw bytes of the source file into a storage stream so they
+    // can be re-read from the start once per comparison below.
+    var rawInput = getBinaryInputStream(sourceName);
+    var storage = StorageStream();
+    var sink = new BOS(storage.getOutputStream(0));
+    for (
+      var pending = rawInput.available();
+      pending > 0;
+      pending = rawInput.available()
+    ) {
+      sink.writeByteArray(rawInput.readByteArray(pending));
+    }
+    rawInput.close();
+    sink.close();
+
+    for (var otherName in UNICODE_FILES) {
+      var otherCharset = UNICODE_FILES[otherName];
+
+      var expected = getUnicharInputStream(otherName, otherCharset);
+      var actual = new CIS(
+        storage.newInputStream(0),
+        sourceCharset,
+        8192,
+        0x0
+      );
+
+      if (equalUnicharStreams(actual, expected)) {
+        continue;
+      }
+      do_throw("unequal streams: " + sourceCharset + ", " + otherCharset);
+    }
+  }
+}
+
+// utility functions
+
+/**
+ * Creates a fresh storage stream through the _SS constructor shortcut.
+ *
+ * @returns the newly initialized storage stream
+ */
+function StorageStream() {
+  // 2^32 - 1, the largest unsigned 32-bit value.
+  var largestUint32 = 2 ** 32 - 1;
+  return new _SS(8192, largestUint32, null);
+}
+
+/**
+ * Opens a file from the test data directory as a character stream.
+ *
+ * @param filename
+ *        name of the file inside the data directory
+ * @param encoding
+ *        charset name handed to the converter input stream
+ * @returns a converter input stream that decodes the file with the given
+ *          encoding
+ */
+function getUnicharInputStream(filename, encoding) {
+  var file = dataDir.clone();
+  file.append(filename);
+
+  const PR_RDONLY = 0x1;
+  var fis = new FIS(
+    file,
+    PR_RDONLY,
+    // Permission bits rw-r--r-- as an octal number. A string such as "0644"
+    // would be converted to the decimal number 644, which is a different
+    // set of bits.
+    0o644,
+    Ci.nsIFileInputStream.CLOSE_ON_EOF
+  );
+  return new CIS(fis, encoding, 8192, 0x0);
+}
+
+/**
+ * Opens a file from the test data directory as a raw binary stream.
+ *
+ * @param filename
+ *        name of the file inside the data directory
+ * @param encoding
+ *        unused; accepted only so the signature parallels
+ *        getUnicharInputStream
+ * @returns a binary input stream over the file's bytes
+ */
+function getBinaryInputStream(filename, encoding) {
+  var file = dataDir.clone();
+  file.append(filename);
+
+  const PR_RDONLY = 0x1;
+  var fis = new FIS(
+    file,
+    PR_RDONLY,
+    // Permission bits rw-r--r-- as an octal number. A string such as "0644"
+    // would be converted to the decimal number 644, which is a different
+    // set of bits.
+    0o644,
+    Ci.nsIFileInputStream.CLOSE_ON_EOF
+  );
+  return new BIS(fis);
+}
+
+/**
+ * Compares the values produced by a reader with an expected array.
+ *
+ * @param stream
+ *        object whose readUnit() returns the next value, or a negative
+ *        number once it is exhausted
+ * @param codePoints
+ *        array of the expected values, in order
+ * @returns true if the reader yields exactly the expected values and then
+ *          ends; false on the first mismatch or on a length difference
+ */
+function equalStreams(stream, codePoints) {
+  for (var position = 0; ; position++) {
+    var decoded = stream.readUnit();
+    if (decoded < 0) {
+      // Reader is exhausted: equal only if every expected value was consumed.
+      return position == codePoints.length;
+    }
+    if (decoded !== codePoints[position]) {
+      return false;
+    }
+  }
+}
+
+/**
+ * Compares two character streams chunk by chunk.
+ *
+ * Both streams are read in lock step, 1024 characters at a time, through
+ * readString(count, outParam); the out-parameter's `value` holds the text
+ * read and the return value is the count reported by the stream.
+ *
+ * @param s1
+ *        first stream
+ * @param s2
+ *        second stream
+ * @returns true if both streams report identical counts and text for every
+ *          read up to and including the read where both report 0; false
+ *          (after printing diagnostics) at the first difference
+ */
+function equalUnicharStreams(s1, s2) {
+  var chunk1 = {};
+  var chunk2 = {};
+
+  for (;;) {
+    var count1 = s1.readString(1024, chunk1);
+    var count2 = s2.readString(1024, chunk2);
+
+    var sameChunk = count1 == count2 && chunk1.value == chunk2.value;
+    if (!sameChunk) {
+      print("r1: " + count1 + ", r2: " + count2);
+      print(chunk1.value.length);
+      print(chunk2.value.length);
+      return false;
+    }
+
+    if (count1 == 0 && count2 == 0) {
+      return true;
+    }
+  }
+}
+
+/**
+ * Converts a string into the array of Unicode code points it contains.
+ *
+ * The string is walked by code point, so a surrogate pair becomes a single
+ * value above 0xFFFF instead of two 16-bit code units. An unpaired surrogate
+ * is returned as its own value. For strings that contain only BMP characters
+ * the result equals the list of charCodeAt() values.
+ *
+ * @param str
+ *        the string to convert
+ * @returns array of numeric code points, in order
+ */
+function stringToCodePoints(str) {
+  return Array.from(str, function(ch) {
+    return ch.codePointAt(0);
+  });
+}
+
+/**
+ * Builds a mask with the n lowest bits set.
+ *
+ * @param n
+ *        number of low bits wanted
+ * @returns 2^n - 1
+ */
+function lowbits(n) {
+  return 2 ** n - 1;
+}
+
+/**
+ * Creates a fresh pipe through the _Pipe constructor shortcut.
+ *
+ * @returns the newly initialized pipe
+ */
+function Pipe() {
+  // Handed through unchanged to the pipe's initializer; see nsIPipe.init for
+  // the meaning of each position.
+  var initArgs = [false, false, 1024, 10, null];
+  return new _Pipe(...initArgs);
+}
+
+// complex charset readers
+
+/**
+ * Reader that pulls Unicode code points, one at a time, out of a stream of
+ * UTF-8 encoded bytes.
+ *
+ * @param stream
+ *        the byte stream to decode; it is wrapped in a binary input stream
+ */
+function UTF8(stream) {
+  this._stream = new BIS(stream);
+}
+UTF8.prototype = {
+  /**
+   * Decodes the code point at the front of the stream.
+   *
+   * @returns the numeric code point, or -1 if the very first byte of the
+   *          character could not be read (treated as end of stream)
+   * @throws NS_ERROR_ILLEGAL_VALUE if the bytes are not a valid, properly
+   *         encoded code point; a failure to read a byte after the first one
+   *         is not caught and propagates from read8()
+   */
+  readUnit() {
+    var input = this._stream;
+
+    // Reads one continuation byte (10xxxxxx) and returns its six payload
+    // bits; anything else is malformed.
+    function continuation() {
+      var next = input.read8();
+      if (next < 0x80 || next >= 0xc0) {
+        throw NS_ERROR_ILLEGAL_VALUE;
+      }
+      return next & 0x3f;
+    }
+
+    // Only a failure on the first byte means "end of stream"; failures on
+    // later bytes of the same character must not be mistaken for it.
+    var lead;
+    try {
+      lead = input.read8();
+    } catch (e) {
+      return -1;
+    }
+
+    // 0xxxxxxx: single byte.
+    if (lead < 0x80) {
+      return lead;
+    }
+
+    // 10xxxxxx: a continuation byte cannot start a character.
+    if (lead < 0xc0) {
+      throw NS_ERROR_ILLEGAL_VALUE;
+    }
+
+    var codePoint;
+    var tail1 = continuation();
+
+    // 110xxxxx: two bytes, U+000080 to U+0007FF.
+    if (lead < 0xe0) {
+      codePoint = ((lead & 0x1f) << 6) + tail1;
+      // Eleven payload bits cannot exceed U+0007FF, so only the lower bound
+      // (which rejects overlong forms) needs checking.
+      if (codePoint < 0x80) {
+        throw NS_ERROR_ILLEGAL_VALUE;
+      }
+      return codePoint;
+    }
+
+    var tail2 = continuation();
+
+    // 1110xxxx: three bytes, U+000800 to U+00FFFF minus the surrogate range.
+    if (lead < 0xf0) {
+      codePoint = ((lead & 0x0f) << 12) + (tail1 << 6) + tail2;
+      var belowSurrogates = codePoint >= 0x800 && codePoint <= 0xd7ff;
+      if (codePoint >= 0xe000 || belowSurrogates) {
+        return codePoint;
+      }
+      throw NS_ERROR_ILLEGAL_VALUE;
+    }
+
+    var tail3 = continuation();
+
+    // 11110xxx: four bytes, U+010000 to U+10FFFF.
+    if (lead < 0xf8) {
+      codePoint =
+        ((lead & 0x07) << 18) + (tail1 << 12) + (tail2 << 6) + tail3;
+      // 0x10FFFF is not of the form 2^n - 1, so the upper bound needs an
+      // explicit check too.
+      if (codePoint >= 0x10000 && codePoint <= 0x10ffff) {
+        return codePoint;
+      }
+      throw NS_ERROR_ILLEGAL_VALUE;
+    }
+
+    // 11111000 or greater: no UTF-8 mapping.
+    throw NS_ERROR_ILLEGAL_VALUE;
+  },
+};
+
+/**
+ * Wraps a UTF-16 stream to allow access to the Unicode code points in it.
+ *
+ * @param stream
+ *        the stream to wrap
+ * @param bigEndian
+ *        true for UTF-16BE, false for UTF-16LE, not present at all for UTF-16
+ *        with a byte-order mark (the mark is then read from the stream here)
+ */
+function UTF16(stream, bigEndian) {
+  this._stream = new BIS(stream);
+  if (arguments.length > 1) {
+    this._bigEndian = bigEndian;
+  } else {
+    // Assumes read16() assembles the two bytes most-significant first, which
+    // is what the mapping below relies on -- confirm against
+    // nsIBinaryInputStream.
+    var bom = this._stream.read16();
+    if (bom == 0xfeff) {
+      this._bigEndian = true;
+    } else if (bom == 0xfffe) {
+      this._bigEndian = false;
+    } else {
+      do_throw("missing BOM: " + bom.toString(16).toUpperCase());
+    }
+  }
+}
+UTF16.prototype = {
+  // Combines two bytes, given in stream order, into one 16-bit code unit
+  // according to the byte order of this reader.
+  _toCodeUnit(first, second) {
+    return this._bigEndian ? (first << 8) + second : (second << 8) + first;
+  },
+
+  /**
+   * Decodes the code point at the front of the stream.
+   *
+   * @returns the numeric code point, or -1 if the very first byte of the
+   *          character could not be read (treated as end of stream)
+   * @throws NS_ERROR_ILLEGAL_VALUE if a low surrogate appears where none is
+   *         expected, or a high surrogate is not followed by a low one; a
+   *         failure to read a byte after the first one is not caught and
+   *         propagates from read8()
+   */
+  readUnit() {
+    var str = this._stream;
+
+    // if at end of stream, must distinguish failure to read any bytes
+    // (correct behavior) from failure to read some byte after the first
+    // in the character
+    var b1;
+    try {
+      b1 = str.read8();
+    } catch (e) {
+      return -1;
+    }
+
+    var w1 = this._toCodeUnit(b1, str.read8());
+
+    if (w1 > 0xdbff && w1 < 0xe000) {
+      // low (second) surrogate, but expecting none or a high (first) one
+      throw NS_ERROR_ILLEGAL_VALUE;
+    }
+
+    if (w1 > 0xd7ff && w1 < 0xdc00) {
+      // non-BMP: a high surrogate must be followed by a low surrogate
+      b1 = str.read8();
+      var w2 = this._toCodeUnit(b1, str.read8());
+      if (w2 < 0xdc00 || w2 > 0xdfff) {
+        throw NS_ERROR_ILLEGAL_VALUE;
+      }
+
+      // The high surrogate carries the upper ten bits and the low surrogate
+      // the lower ten bits of (code point - 0x10000). The largest possible
+      // result is 0x10FFFF, so no upper bounds-check is needed.
+      return 0x10000 + ((w1 & 0x3ff) << 10) + (w2 & 0x3ff);
+    }
+
+    // non-surrogate
+    return w1;
+  },
+};