From 6bf0a5cb5034a7e684dcc3500e841785237ce2dd Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 19:32:43 +0200 Subject: Adding upstream version 1:115.7.0. Signed-off-by: Daniel Baumann --- intl/uconv/components.conf | 34 +++ intl/uconv/crashtests/563618.html | 12 + intl/uconv/crashtests/crashtests.list | 2 + intl/uconv/crashtests/omt-non-utf-8-jsurl.html | 14 + intl/uconv/directory.txt | 32 +++ intl/uconv/moz.build | 32 +++ intl/uconv/nsConverterInputStream.cpp | 256 +++++++++++++++++ intl/uconv/nsConverterInputStream.h | 64 +++++ intl/uconv/nsConverterOutputStream.cpp | 115 ++++++++ intl/uconv/nsConverterOutputStream.h | 39 +++ intl/uconv/nsIScriptableUConv.idl | 79 ++++++ intl/uconv/nsITextToSubURI.idl | 60 ++++ intl/uconv/nsScriptableUConv.cpp | 256 +++++++++++++++++ intl/uconv/nsScriptableUConv.h | 34 +++ intl/uconv/nsTextToSubURI.cpp | 178 ++++++++++++ intl/uconv/nsTextToSubURI.h | 36 +++ intl/uconv/tests/gtest/TestShortRead.cpp | 109 ++++++++ intl/uconv/tests/gtest/moz.build | 11 + intl/uconv/tests/mochitest.ini | 14 + intl/uconv/tests/moz.build | 13 + intl/uconv/tests/stressgb.pl | 23 ++ intl/uconv/tests/test_big5_encoder.html | 43 +++ intl/uconv/tests/test_bug335816.html | 40 +++ intl/uconv/tests/test_bug843434.html | 27 ++ intl/uconv/tests/test_bug959058-1.html | 28 ++ intl/uconv/tests/test_bug959058-2.html | 28 ++ intl/uconv/tests/test_ncr_fallback.html | 74 +++++ .../tests/test_singlebyte_overconsumption.html | 33 +++ .../tests/test_unicode_noncharacterescapes.html | 303 ++++++++++++++++++++ .../tests/test_unicode_noncharacters_gb18030.html | 305 +++++++++++++++++++++ .../tests/test_unicode_noncharacters_utf8.html | 303 ++++++++++++++++++++ intl/uconv/tests/test_utf8_overconsumption.html | 39 +++ 32 files changed, 2636 insertions(+) create mode 100644 intl/uconv/components.conf create mode 100644 intl/uconv/crashtests/563618.html create mode 100644 intl/uconv/crashtests/crashtests.list create mode 100644 intl/uconv/crashtests/omt-non-utf-8-jsurl.html create mode 100644 intl/uconv/directory.txt create mode 100644 intl/uconv/moz.build create mode 100644 intl/uconv/nsConverterInputStream.cpp create mode 100644 intl/uconv/nsConverterInputStream.h create mode 100644 intl/uconv/nsConverterOutputStream.cpp create mode 100644 intl/uconv/nsConverterOutputStream.h create mode 100644 intl/uconv/nsIScriptableUConv.idl create mode 100644 intl/uconv/nsITextToSubURI.idl create mode 100644 intl/uconv/nsScriptableUConv.cpp create mode 100644 intl/uconv/nsScriptableUConv.h create mode 100644 intl/uconv/nsTextToSubURI.cpp create mode 100644 intl/uconv/nsTextToSubURI.h create mode 100644 intl/uconv/tests/gtest/TestShortRead.cpp create mode 100644 intl/uconv/tests/gtest/moz.build create mode 100644 intl/uconv/tests/mochitest.ini create mode 100644 intl/uconv/tests/moz.build create mode 100644 intl/uconv/tests/stressgb.pl create mode 100644 intl/uconv/tests/test_big5_encoder.html create mode 100644 intl/uconv/tests/test_bug335816.html create mode 100644 intl/uconv/tests/test_bug843434.html create mode 100644 intl/uconv/tests/test_bug959058-1.html create mode 100644 intl/uconv/tests/test_bug959058-2.html create mode 100644 intl/uconv/tests/test_ncr_fallback.html create mode 100644 intl/uconv/tests/test_singlebyte_overconsumption.html create mode 100644 intl/uconv/tests/test_unicode_noncharacterescapes.html create mode 100644 intl/uconv/tests/test_unicode_noncharacters_gb18030.html create mode 100644 intl/uconv/tests/test_unicode_noncharacters_utf8.html create mode 100644 intl/uconv/tests/test_utf8_overconsumption.html (limited to 'intl/uconv') diff --git a/intl/uconv/components.conf b/intl/uconv/components.conf new file mode 100644 index 0000000000..00686f661a --- /dev/null +++ b/intl/uconv/components.conf @@ -0,0 +1,34 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +Classes = [ + { + 'cid': '{2bc2ad62-ad5d-4b7b-a9db-f74ae203c527}', + 'contract_ids': ['@mozilla.org/intl/converter-input-stream;1'], + 'type': 'nsConverterInputStream', + 'headers': ['nsConverterInputStream.h'], + }, + { + 'cid': '{ff8780a5-bbb1-4bc5-8ee7-057e7bc5c925}', + 'contract_ids': ['@mozilla.org/intl/converter-output-stream;1'], + 'type': 'nsConverterOutputStream', + 'headers': ['/intl/uconv/nsConverterOutputStream.h'], + }, + { + 'cid': '{0a698c44-3bff-11d4-9649-00c0ca135b4e}', + 'contract_ids': ['@mozilla.org/intl/scriptableunicodeconverter'], + 'type': 'nsScriptableUnicodeConverter', + 'headers': ['/intl/uconv/nsScriptableUConv.h'], + }, + { + 'js_name': 'textToSubURI', + 'cid': '{8b042e22-6f87-11d3-b3c8-00805f8a6670}', + 'contract_ids': ['@mozilla.org/intl/texttosuburi;1'], + 'interfaces': ['nsITextToSubURI'], + 'type': 'nsTextToSubURI', + 'headers': ['/intl/uconv/nsTextToSubURI.h'], + }, +] diff --git a/intl/uconv/crashtests/563618.html b/intl/uconv/crashtests/563618.html new file mode 100644 index 0000000000..e36b664762 --- /dev/null +++ b/intl/uconv/crashtests/563618.html @@ -0,0 +1,12 @@ + + + + + Serbian Glyph Test + + + +

+ + diff --git a/intl/uconv/crashtests/crashtests.list b/intl/uconv/crashtests/crashtests.list new file mode 100644 index 0000000000..6c54a699c1 --- /dev/null +++ b/intl/uconv/crashtests/crashtests.list @@ -0,0 +1,2 @@ +load 563618.html +load omt-non-utf-8-jsurl.html diff --git a/intl/uconv/crashtests/omt-non-utf-8-jsurl.html b/intl/uconv/crashtests/omt-non-utf-8-jsurl.html new file mode 100644 index 0000000000..033e38a280 --- /dev/null +++ b/intl/uconv/crashtests/omt-non-utf-8-jsurl.html @@ -0,0 +1,14 @@ + + + + + Test for off the main thread non-UTF-8 javascript: URL + + + + + diff --git a/intl/uconv/directory.txt b/intl/uconv/directory.txt new file mode 100644 index 0000000000..2b6be7af7f --- /dev/null +++ b/intl/uconv/directory.txt @@ -0,0 +1,32 @@ +Directory Structure : +================================ + +idl - public .idl files +public - public header file +src - source directory of charset converter manager and utilities, and + charset converters for ISO-8859-1, CP1252, MacRoman and UTF-8 +tests - tests program and application for charset converter +tests/unit - xpcshell tests +tools - tools to build the tables used by the converters +util - utility functions used by the converters + +The following directories contain different charset converters: + +ucvcn - Simplified Chinese charsets - GB2312, HZ, ISO-2022-CN, GBK, GB18030 +ucvja - Japanese charsets - Shift-JIS, ISO-2022-JP, EUC-JP +ucvko - Korean charsets - ISO-2022-KR, EUC-KR, Johab +ucvlatin - Latin charsets and others - ISO-8859-x, CP1250-1258 + CP866, 874, KOI8, + Mac charsets, TIS620, UTF16 +ucvtw - Traditional Chinese charsets Set 1 - Big5 +ucvtw2 - Traditional Chinese charsets Set 2 - EUC-TW + +Within the directories containing charset converters: + +*.ut - tables used to convert to Unicode from a charset +*.uf - tables used to convert to a charset from Unicode + +The following directories are obsolete and should not be used: + +ucvth +ucvvt diff --git a/intl/uconv/moz.build b/intl/uconv/moz.build new file mode 100644 index 0000000000..f21e4055f9 --- /dev/null +++ b/intl/uconv/moz.build @@ -0,0 +1,32 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +TEST_DIRS += ["tests"] + +XPIDL_SOURCES += [ + "nsIScriptableUConv.idl", + "nsITextToSubURI.idl", +] + +XPIDL_MODULE = "uconv" + +EXPORTS += [ + "nsConverterInputStream.h", + "nsTextToSubURI.h", +] + +UNIFIED_SOURCES += [ + "nsConverterInputStream.cpp", + "nsConverterOutputStream.cpp", + "nsScriptableUConv.cpp", + "nsTextToSubURI.cpp", +] + +XPCOM_MANIFESTS += [ + "components.conf", +] + +FINAL_LIBRARY = "xul" diff --git a/intl/uconv/nsConverterInputStream.cpp b/intl/uconv/nsConverterInputStream.cpp new file mode 100644 index 0000000000..e3efdbc146 --- /dev/null +++ b/intl/uconv/nsConverterInputStream.cpp @@ -0,0 +1,256 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "nsConverterInputStream.h" +#include "nsIInputStream.h" +#include "nsReadLine.h" +#include "nsStreamUtils.h" + +#include +#include + +using namespace mozilla; + +#define CONVERTER_BUFFER_SIZE 8192 + +NS_IMPL_ISUPPORTS(nsConverterInputStream, nsIConverterInputStream, + nsIUnicharInputStream, nsIUnicharLineInputStream) + +NS_IMETHODIMP +nsConverterInputStream::Init(nsIInputStream* aStream, const char* aCharset, + int32_t aBufferSize, char16_t aReplacementChar) { + nsAutoCString label; + if (!aCharset) { + label.AssignLiteral("UTF-8"); + } else { + label = aCharset; + } + + auto encoding = Encoding::ForLabelNoReplacement(label); + if (!encoding) { + return NS_ERROR_UCONV_NOCONV; + } + // Previously, the implementation auto-switched only + // between the two UTF-16 variants and only when + // initialized with an endianness-unspecific label. + mConverter = encoding->NewDecoder(); + + size_t outputBufferSize; + if (aBufferSize <= 0) { + aBufferSize = CONVERTER_BUFFER_SIZE; + outputBufferSize = CONVERTER_BUFFER_SIZE; + } else { + // NetUtil.jsm assumes that if buffer size equals + // the input size, the whole stream will be processed + // as one readString. This is not true with encoding_rs, + // because encoding_rs might want to see space for a + // surrogate pair, so let's compute a larger output + // buffer length. + CheckedInt needed = mConverter->MaxUTF16BufferLength(aBufferSize); + if (!needed.isValid()) { + return NS_ERROR_OUT_OF_MEMORY; + } + outputBufferSize = needed.value(); + } + + // set up our buffers. + if (!mByteData.SetCapacity(aBufferSize, mozilla::fallible) || + !mUnicharData.SetLength(outputBufferSize, mozilla::fallible)) { + return NS_ERROR_OUT_OF_MEMORY; + } + + mInput = aStream; + mErrorsAreFatal = !aReplacementChar; + return NS_OK; +} + +NS_IMETHODIMP +nsConverterInputStream::Close() { + nsresult rv = mInput ? mInput->Close() : NS_OK; + mLineBuffer = nullptr; + mInput = nullptr; + mConverter = nullptr; + mByteData.Clear(); + mUnicharData.Clear(); + return rv; +} + +NS_IMETHODIMP +nsConverterInputStream::Read(char16_t* aBuf, uint32_t aCount, + uint32_t* aReadCount) { + NS_ASSERTION(mUnicharDataLength >= mUnicharDataOffset, "unsigned madness"); + uint32_t readCount = mUnicharDataLength - mUnicharDataOffset; + if (0 == readCount) { + // Fill the unichar buffer + readCount = Fill(&mLastErrorCode); + if (readCount == 0) { + *aReadCount = 0; + return mLastErrorCode; + } + } + if (readCount > aCount) { + readCount = aCount; + } + memcpy(aBuf, mUnicharData.Elements() + mUnicharDataOffset, + readCount * sizeof(char16_t)); + mUnicharDataOffset += readCount; + *aReadCount = readCount; + return NS_OK; +} + +NS_IMETHODIMP +nsConverterInputStream::ReadSegments(nsWriteUnicharSegmentFun aWriter, + void* aClosure, uint32_t aCount, + uint32_t* aReadCount) { + NS_ASSERTION(mUnicharDataLength >= mUnicharDataOffset, "unsigned madness"); + uint32_t codeUnitsToWrite = mUnicharDataLength - mUnicharDataOffset; + if (0 == codeUnitsToWrite) { + // Fill the unichar buffer + codeUnitsToWrite = Fill(&mLastErrorCode); + if (codeUnitsToWrite == 0) { + *aReadCount = 0; + return mLastErrorCode; + } + } + + if (codeUnitsToWrite > aCount) { + codeUnitsToWrite = aCount; + } + + uint32_t codeUnitsWritten; + uint32_t totalCodeUnitsWritten = 0; + + while (codeUnitsToWrite) { + nsresult rv = + aWriter(this, aClosure, mUnicharData.Elements() + mUnicharDataOffset, + totalCodeUnitsWritten, codeUnitsToWrite, &codeUnitsWritten); + if (NS_FAILED(rv)) { + // don't propagate errors to the caller + break; + } + + codeUnitsToWrite -= codeUnitsWritten; + totalCodeUnitsWritten += codeUnitsWritten; + mUnicharDataOffset += codeUnitsWritten; + } + + *aReadCount = totalCodeUnitsWritten; + + return NS_OK; +} + +NS_IMETHODIMP +nsConverterInputStream::ReadString(uint32_t aCount, nsAString& aString, + uint32_t* aReadCount) { + NS_ASSERTION(mUnicharDataLength >= mUnicharDataOffset, "unsigned madness"); + uint32_t readCount = mUnicharDataLength - mUnicharDataOffset; + if (0 == readCount) { + // Fill the unichar buffer + readCount = Fill(&mLastErrorCode); + if (readCount == 0) { + *aReadCount = 0; + return mLastErrorCode; + } + } + if (readCount > aCount) { + readCount = aCount; + } + const char16_t* buf = mUnicharData.Elements() + mUnicharDataOffset; + aString.Assign(buf, readCount); + mUnicharDataOffset += readCount; + *aReadCount = readCount; + return NS_OK; +} + +uint32_t nsConverterInputStream::Fill(nsresult* aErrorCode) { + if (!mInput) { + // We already closed the stream! + *aErrorCode = NS_BASE_STREAM_CLOSED; + return 0; + } + + if (NS_FAILED(mLastErrorCode)) { + // We failed to completely convert last time, and error-recovery + // is disabled. We will fare no better this time, so... + *aErrorCode = mLastErrorCode; + return 0; + } + + // mUnicharData.Length() is the buffer length, not the fill status. + // mUnicharDataLength reflects the current fill status. + mUnicharDataLength = 0; + // Whenever we convert, mUnicharData is logically empty. + mUnicharDataOffset = 0; + + // Continue trying to read from the source stream until we successfully decode + // a character or encounter an error, as returning `0` here implies that the + // stream is complete. + // + // If the converter has been cleared, we've fully consumed the stream, and + // want to report EOF. + while (mUnicharDataLength == 0 && mConverter) { + // We assume a many to one conversion and are using equal sizes for + // the two buffers. However if an error happens at the very start + // of a byte buffer we may end up in a situation where n bytes lead + // to n+1 unicode chars. Thus we need to keep track of the leftover + // bytes as we convert. + + uint32_t nb; + *aErrorCode = NS_FillArray(mByteData, mInput, mLeftOverBytes, &nb); + if (NS_FAILED(*aErrorCode)) { + return 0; + } + + NS_ASSERTION(uint32_t(nb) + mLeftOverBytes == mByteData.Length(), + "mByteData is lying to us somewhere"); + + // If `NS_FillArray` failed to read any new bytes, this is the last read, + // and we're at the end of the stream. + bool last = (nb == 0); + + // Now convert as much of the byte buffer to unicode as possible + auto src = AsBytes(Span(mByteData)); + auto dst = Span(mUnicharData); + + // Truncation from size_t to uint32_t below is OK, because the sizes + // are bounded by the lengths of mByteData and mUnicharData. + uint32_t result; + size_t read; + size_t written; + if (mErrorsAreFatal) { + std::tie(result, read, written) = + mConverter->DecodeToUTF16WithoutReplacement(src, dst, last); + } else { + std::tie(result, read, written, std::ignore) = + mConverter->DecodeToUTF16(src, dst, last); + } + mLeftOverBytes = mByteData.Length() - read; + mUnicharDataLength = written; + // Clear `mConverter` if we reached the end of the stream, as we can't + // call methods on it anymore. This will also signal EOF to the caller + // through the loop condition. + if (last) { + MOZ_ASSERT(mLeftOverBytes == 0, + "Failed to read all bytes on the last pass?"); + mConverter = nullptr; + } + // If we got a decode error, we're done. + if (result != kInputEmpty && result != kOutputFull) { + MOZ_ASSERT(mErrorsAreFatal, "How come DecodeToUTF16() reported error?"); + *aErrorCode = NS_ERROR_UDEC_ILLEGALINPUT; + return 0; + } + } + *aErrorCode = NS_OK; + return mUnicharDataLength; +} + +NS_IMETHODIMP +nsConverterInputStream::ReadLine(nsAString& aLine, bool* aResult) { + if (!mLineBuffer) { + mLineBuffer = MakeUnique>(); + } + return NS_ReadLine(this, mLineBuffer.get(), aLine, aResult); +} diff --git a/intl/uconv/nsConverterInputStream.h b/intl/uconv/nsConverterInputStream.h new file mode 100644 index 0000000000..55555fc679 --- /dev/null +++ b/intl/uconv/nsConverterInputStream.h @@ -0,0 +1,64 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef nsConverterInputStream_h +#define nsConverterInputStream_h + +#include "nsIInputStream.h" +#include "nsIConverterInputStream.h" +#include "nsIUnicharLineInputStream.h" +#include "nsTArray.h" +#include "nsCOMPtr.h" +#include "nsReadLine.h" +#include "mozilla/Encoding.h" +#include "mozilla/UniquePtr.h" + +#define NS_CONVERTERINPUTSTREAM_CONTRACTID \ + "@mozilla.org/intl/converter-input-stream;1" + +// {2BC2AD62-AD5D-4b7b-A9DB-F74AE203C527} +#define NS_CONVERTERINPUTSTREAM_CID \ + { \ + 0x2bc2ad62, 0xad5d, 0x4b7b, { \ + 0xa9, 0xdb, 0xf7, 0x4a, 0xe2, 0x3, 0xc5, 0x27 \ + } \ + } + +class nsConverterInputStream : public nsIConverterInputStream, + public nsIUnicharLineInputStream { + public: + NS_DECL_ISUPPORTS + NS_DECL_NSIUNICHARINPUTSTREAM + NS_DECL_NSIUNICHARLINEINPUTSTREAM + NS_DECL_NSICONVERTERINPUTSTREAM + + nsConverterInputStream() + : mLastErrorCode(NS_OK), + mLeftOverBytes(0), + mUnicharDataOffset(0), + mUnicharDataLength(0), + mErrorsAreFatal(false), + mLineBuffer(nullptr) {} + + private: + virtual ~nsConverterInputStream() { Close(); } + + uint32_t Fill(nsresult* aErrorCode); + + mozilla::UniquePtr mConverter; + FallibleTArray mByteData; + FallibleTArray mUnicharData; + nsCOMPtr mInput; + + nsresult mLastErrorCode; + uint32_t mLeftOverBytes; + uint32_t mUnicharDataOffset; + uint32_t mUnicharDataLength; + bool mErrorsAreFatal; + + mozilla::UniquePtr > mLineBuffer; +}; + +#endif diff --git a/intl/uconv/nsConverterOutputStream.cpp b/intl/uconv/nsConverterOutputStream.cpp new file mode 100644 index 0000000000..a24adb0377 --- /dev/null +++ b/intl/uconv/nsConverterOutputStream.cpp @@ -0,0 +1,115 @@ +/* vim:set expandtab ts=4 sw=2 sts=2 cin: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "nsCOMPtr.h" +#include "nsIOutputStream.h" +#include "nsString.h" +#include "nsConverterOutputStream.h" +#include "mozilla/Encoding.h" + +using namespace mozilla; + +NS_IMPL_ISUPPORTS(nsConverterOutputStream, nsIUnicharOutputStream, + nsIConverterOutputStream) + +nsConverterOutputStream::~nsConverterOutputStream() { Close(); } + +NS_IMETHODIMP +nsConverterOutputStream::Init(nsIOutputStream* aOutStream, + const char* aCharset) { + MOZ_ASSERT(aOutStream, "Null output stream!"); + + const Encoding* encoding; + if (!aCharset) { + encoding = UTF_8_ENCODING; + } else { + encoding = Encoding::ForLabelNoReplacement(MakeStringSpan(aCharset)); + if (!encoding || encoding == UTF_16LE_ENCODING || + encoding == UTF_16BE_ENCODING) { + return NS_ERROR_UCONV_NOCONV; + } + } + + mConverter = encoding->NewEncoder(); + + mOutStream = aOutStream; + + return NS_OK; +} + +NS_IMETHODIMP +nsConverterOutputStream::Write(uint32_t aCount, const char16_t* aChars, + bool* aSuccess) { + if (!mOutStream) { + NS_ASSERTION(!mConverter, "Closed streams shouldn't have converters"); + return NS_BASE_STREAM_CLOSED; + } + MOZ_ASSERT(mConverter, "Must have a converter when not closed"); + uint8_t buffer[4096]; + auto dst = Span(buffer); + auto src = Span(aChars, aCount); + for (;;) { + uint32_t result; + size_t read; + size_t written; + std::tie(result, read, written, std::ignore) = + mConverter->EncodeFromUTF16(src, dst, false); + src = src.From(read); + uint32_t streamWritten; + nsresult rv = mOutStream->Write(reinterpret_cast(dst.Elements()), + written, &streamWritten); + *aSuccess = NS_SUCCEEDED(rv) && written == streamWritten; + if (!(*aSuccess)) { + return rv; + } + if (result == kInputEmpty) { + return NS_OK; + } + } +} + +NS_IMETHODIMP +nsConverterOutputStream::WriteString(const nsAString& aString, bool* aSuccess) { + int32_t inLen = aString.Length(); + nsAString::const_iterator i; + aString.BeginReading(i); + return Write(inLen, i.get(), aSuccess); +} + +NS_IMETHODIMP +nsConverterOutputStream::Flush() { + if (!mOutStream) return NS_OK; // Already closed. + + // If we are encoding to ISO-2022-JP, potentially + // transition back to the ASCII state. The buffer + // needs to be large enough for an additional NCR, + // though. + uint8_t buffer[12]; + auto dst = Span(buffer); + Span src(nullptr); + uint32_t result; + size_t written; + std::tie(result, std::ignore, written, std::ignore) = + mConverter->EncodeFromUTF16(src, dst, true); + MOZ_ASSERT(result == kInputEmpty); + uint32_t streamWritten; + if (!written) { + return NS_OK; + } + return mOutStream->Write(reinterpret_cast(dst.Elements()), written, + &streamWritten); +} + +NS_IMETHODIMP +nsConverterOutputStream::Close() { + if (!mOutStream) return NS_OK; // Already closed. + + nsresult rv1 = Flush(); + + nsresult rv2 = mOutStream->Close(); + mOutStream = nullptr; + mConverter = nullptr; + return NS_FAILED(rv1) ? rv1 : rv2; +} diff --git a/intl/uconv/nsConverterOutputStream.h b/intl/uconv/nsConverterOutputStream.h new file mode 100644 index 0000000000..74b873acd5 --- /dev/null +++ b/intl/uconv/nsConverterOutputStream.h @@ -0,0 +1,39 @@ +/* vim:set expandtab ts=4 sw=2 sts=2 cin: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef NSCONVERTEROUTPUTSTREAM_H_ +#define NSCONVERTEROUTPUTSTREAM_H_ + +#include "nsIConverterOutputStream.h" +#include "nsCOMPtr.h" +#include "mozilla/Attributes.h" +#include "mozilla/Encoding.h" + +class nsIOutputStream; + +/* ff8780a5-bbb1-4bc5-8ee7-057e7bc5c925 */ +#define NS_CONVERTEROUTPUTSTREAM_CID \ + { \ + 0xff8780a5, 0xbbb1, 0x4bc5, { \ + 0x8e, 0xe7, 0x05, 0x7e, 0x7b, 0xc5, 0xc9, 0x25 \ + } \ + } + +class nsConverterOutputStream final : public nsIConverterOutputStream { + public: + nsConverterOutputStream() = default; + + NS_DECL_ISUPPORTS + NS_DECL_NSIUNICHAROUTPUTSTREAM + NS_DECL_NSICONVERTEROUTPUTSTREAM + + private: + ~nsConverterOutputStream(); + + mozilla::UniquePtr mConverter; + nsCOMPtr mOutStream; +}; + +#endif diff --git a/intl/uconv/nsIScriptableUConv.idl b/intl/uconv/nsIScriptableUConv.idl new file mode 100644 index 0000000000..f4557dce8b --- /dev/null +++ b/intl/uconv/nsIScriptableUConv.idl @@ -0,0 +1,79 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "nsISupports.idl" + +interface nsIInputStream; + +%{C++ +// {0A698C44-3BFF-11d4-9649-00C0CA135B4E} +#define NS_ISCRIPTABLEUNICODECONVERTER_CID { 0x0A698C44, 0x3BFF, 0x11d4, { 0x96, 0x49, 0x00, 0xC0, 0xCA, 0x13, 0x5B, 0x4E } } +#define NS_ISCRIPTABLEUNICODECONVERTER_CONTRACTID "@mozilla.org/intl/scriptableunicodeconverter" +%} + +/** + * In new code, please use the WebIDL TextDecoder and TextEncoder + * instead. They represent bytes as Uint8Array (or as view to such + * array), which is the current best practice for representing bytes + * in JavaScript. + * + * This interface converts between UTF-16 in JavaScript strings + * and bytes transported as the unsigned value of each byte + * transported in a code unit of the same numeric value in + * a JavaScript string. + * + * @created 8/Jun/2000 + * @author Makoto Kato [m_kato@ga2.so-net.ne.jp] + */ +[scriptable, uuid(f36ee324-5c1c-437f-ba10-2b4db7a18031)] +interface nsIScriptableUnicodeConverter : nsISupports +{ + /** + * Converts the data from Unicode to one Charset. + * Returns the converted string. After converting, Finish should be called + * and its return value appended to this return value. + */ + ACString ConvertFromUnicode(in AString aSrc); + + /** + * Returns the terminator string. + * Should be called after ConvertFromUnicode() and appended to that + * function's return value. + */ + ACString Finish(); + + /** + * Converts the data from one Charset to Unicode. + */ + AString ConvertToUnicode(in ACString aSrc); + + /** + * Convert a unicode string to an array of bytes. Finish does not need to be + * called. + */ + void convertToByteArray(in AString aString, + [optional] out unsigned long aLen, + [array, size_is(aLen),retval] out octet aData); + + /** + * Converts a unicode string to an input stream. The bytes in the stream are + * encoded according to the charset attribute. + * The returned stream will be nonblocking. + */ + nsIInputStream convertToInputStream(in AString aString); + + /** + * Current character set. + * + * @throw NS_ERROR_UCONV_NOCONV + * The requested charset is not supported. + */ + attribute ACString charset; + + /** + * Meaningless + */ + attribute boolean isInternal; +}; diff --git a/intl/uconv/nsITextToSubURI.idl b/intl/uconv/nsITextToSubURI.idl new file mode 100644 index 0000000000..3bb404e414 --- /dev/null +++ b/intl/uconv/nsITextToSubURI.idl @@ -0,0 +1,60 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "nsISupports.idl" + + +%{C++ +// {8B042E22-6F87-11d3-B3C8-00805F8A6670} +#define NS_TEXTTOSUBURI_CID { 0x8b042e22, 0x6f87, 0x11d3, { 0xb3, 0xc8, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } } +#define NS_ITEXTTOSUBURI_CONTRACTID "@mozilla.org/intl/texttosuburi;1" +%} + +[scriptable, uuid(8B042E24-6F87-11d3-B3C8-00805F8A6670)] +interface nsITextToSubURI : nsISupports +{ + ACString ConvertAndEscape(in ACString charset, in AString text); + AString UnEscapeAndConvert(in ACString charset, in ACString text); + + /** + * Unescapes the given URI fragment (for UI purpose only) + * Note: + *
    + *
  • escaping back the result (unescaped string) is not guaranteed to + * give the original escaped string + *
  • The URI fragment (escaped) is assumed to be in UTF-8 and converted + * to AString (UTF-16) + *
  • In case of successful conversion any resulting character listed + * in netwerk/dns/IDNCharacterBlocklist.inc (except space) is escaped + *
  • Always succeeeds (callers don't need to do error checking) + *
+ * + * @param aURIFragment the URI (or URI fragment) to unescape + * @param aDontEscape whether to escape IDN blocklisted characters + * @return Unescaped aURIFragment converted to unicode + */ + AString unEscapeURIForUI(in AUTF8String aURIFragment, + [optional] in boolean aDontEscape); +%{C++ + nsresult UnEscapeURIForUI(const nsACString& aURIFragment, + nsAString& _retval) { + return UnEscapeURIForUI(aURIFragment, false, _retval); + } +%} + + /** + * Unescapes only non ASCII characters in the given URI fragment + * note: this method assumes the URI as UTF-8 and fallbacks to the given + * charset if the charset is an ASCII superset + * + * @param aCharset the charset to convert from + * @param aURIFragment the URI (or URI fragment) to unescape + * @return Unescaped aURIFragment converted to unicode + * @throws NS_ERROR_UCONV_NOCONV when there is no decoder for aCharset + * or NS_ERROR_UDEC_ILLEGALINPUT in case of conversion failure + */ + [binaryname(UnEscapeNonAsciiURIJS)] + AString unEscapeNonAsciiURI(in ACString aCharset, in AUTF8String aURIFragment); +}; diff --git a/intl/uconv/nsScriptableUConv.cpp b/intl/uconv/nsScriptableUConv.cpp new file mode 100644 index 0000000000..8a9638f2ce --- /dev/null +++ b/intl/uconv/nsScriptableUConv.cpp @@ -0,0 +1,256 @@ + +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "nsString.h" +#include "nsIScriptableUConv.h" +#include "nsScriptableUConv.h" +#include "nsIStringStream.h" +#include "nsComponentManagerUtils.h" + +#include + +using namespace mozilla; + +/* Implementation file */ +NS_IMPL_ISUPPORTS(nsScriptableUnicodeConverter, nsIScriptableUnicodeConverter) + +nsScriptableUnicodeConverter::nsScriptableUnicodeConverter() + : mIsInternal(false) {} + +nsScriptableUnicodeConverter::~nsScriptableUnicodeConverter() = default; + +NS_IMETHODIMP +nsScriptableUnicodeConverter::ConvertFromUnicode(const nsAString& aSrc, + nsACString& _retval) { + if (!mEncoder) return NS_ERROR_FAILURE; + + // We can compute the length without replacement, because the + // the replacement is only one byte long and a mappable character + // would always output something, i.e. at least one byte. + // When encoding to ISO-2022-JP, unmappables shouldn't be able + // to cause more escape sequences to be emitted than the mappable + // worst case where every input character causes an escape into + // a different state. + CheckedInt needed = + mEncoder->MaxBufferLengthFromUTF16WithoutReplacement(aSrc.Length()); + if (!needed.isValid() || needed.value() > UINT32_MAX) { + return NS_ERROR_OUT_OF_MEMORY; + } + + auto dstChars = _retval.GetMutableData(needed.value(), fallible); + if (!dstChars) { + return NS_ERROR_OUT_OF_MEMORY; + } + + auto src = Span(aSrc); + auto dst = AsWritableBytes(*dstChars); + size_t totalWritten = 0; + for (;;) { + auto [result, read, written] = + mEncoder->EncodeFromUTF16WithoutReplacement(src, dst, false); + if (result != kInputEmpty && result != kOutputFull) { + MOZ_RELEASE_ASSERT(written < dst.Length(), + "Unmappables with one-byte replacement should not " + "exceed mappable worst case."); + dst[written++] = '?'; + } + totalWritten += written; + if (result == kInputEmpty) { + MOZ_ASSERT(totalWritten <= UINT32_MAX); + if (!_retval.SetLength(totalWritten, fallible)) { + return NS_ERROR_OUT_OF_MEMORY; + } + return NS_OK; + } + src = src.From(read); + dst = dst.From(written); + } +} + +NS_IMETHODIMP +nsScriptableUnicodeConverter::Finish(nsACString& _retval) { + // The documentation for this method says it should be called after + // ConvertFromUnicode(). However, our own tests called it after + // convertFromByteArray(), i.e. when *decoding*. + // Assuming that there exists extensions that similarly call + // this at the wrong time, let's deal. In general, it is a design + // error for this class to handle conversions in both directions. + if (!mEncoder) { + _retval.Truncate(); + mDecoder->Encoding()->NewDecoderWithBOMRemovalInto(*mDecoder); + return NS_OK; + } + // If we are encoding to ISO-2022-JP, potentially + // transition back to the ASCII state. The buffer + // needs to be large enough for an additional NCR, + // though. + _retval.SetLength(13); + auto dst = AsWritableBytes(_retval.GetMutableData(13)); + Span src(nullptr); + uint32_t result; + size_t read; + size_t written; + std::tie(result, read, written, std::ignore) = + mEncoder->EncodeFromUTF16(src, dst, true); + MOZ_ASSERT(!read); + MOZ_ASSERT(result == kInputEmpty); + _retval.SetLength(written); + + mDecoder->Encoding()->NewDecoderWithBOMRemovalInto(*mDecoder); + mEncoder->Encoding()->NewEncoderInto(*mEncoder); + return NS_OK; +} + +NS_IMETHODIMP +nsScriptableUnicodeConverter::ConvertToUnicode(const nsACString& aSrc, + nsAString& _retval) { + if (!mDecoder) return NS_ERROR_FAILURE; + + uint32_t length = aSrc.Length(); + + CheckedInt needed = mDecoder->MaxUTF16BufferLength(length); + if (!needed.isValid() || needed.value() > UINT32_MAX) { + return NS_ERROR_OUT_OF_MEMORY; + } + + auto dst = _retval.GetMutableData(needed.value(), fallible); + if (!dst) { + return NS_ERROR_OUT_OF_MEMORY; + } + + auto src = + Span(reinterpret_cast(aSrc.BeginReading()), length); + uint32_t result; + size_t read; + size_t written; + // The UTF-8 decoder used to throw regardless of the error behavior. + // Simulating the old behavior for compatibility with legacy callers. + // If callers want control over the behavior, they should switch to + // TextDecoder. + if (mDecoder->Encoding() == UTF_8_ENCODING) { + std::tie(result, read, written) = + mDecoder->DecodeToUTF16WithoutReplacement(src, *dst, false); + if (result != kInputEmpty) { + return NS_ERROR_UDEC_ILLEGALINPUT; + } + } else { + std::tie(result, read, written, std::ignore) = + mDecoder->DecodeToUTF16(src, *dst, false); + } + MOZ_ASSERT(result == kInputEmpty); + MOZ_ASSERT(read == length); + MOZ_ASSERT(written <= needed.value()); + if (!_retval.SetLength(written, fallible)) { + return NS_ERROR_OUT_OF_MEMORY; + } + return NS_OK; +} + +NS_IMETHODIMP +nsScriptableUnicodeConverter::ConvertToByteArray(const nsAString& aString, + uint32_t* aLen, + uint8_t** _aData) { + if (!mEncoder) return NS_ERROR_FAILURE; + + CheckedInt needed = + mEncoder->MaxBufferLengthFromUTF16WithoutReplacement(aString.Length()); + if (!needed.isValid() || needed.value() > UINT32_MAX) { + return NS_ERROR_OUT_OF_MEMORY; + } + + uint8_t* data = (uint8_t*)malloc(needed.value()); + if (!data) { + return NS_ERROR_OUT_OF_MEMORY; + } + auto src = Span(aString); + auto dst = Span(data, needed.value()); + size_t totalWritten = 0; + for (;;) { + auto [result, read, written] = + mEncoder->EncodeFromUTF16WithoutReplacement(src, dst, true); + if (result != kInputEmpty && result != kOutputFull) { + // There's always room for one byte in the case of + // an unmappable character, because otherwise + // we'd have gotten `kOutputFull`. + dst[written++] = '?'; + } + totalWritten += written; + if (result == kInputEmpty) { + *_aData = data; + MOZ_ASSERT(totalWritten <= UINT32_MAX); + *aLen = totalWritten; + return NS_OK; + } + src = src.From(read); + dst = dst.From(written); + } +} + +NS_IMETHODIMP +nsScriptableUnicodeConverter::ConvertToInputStream(const nsAString& aString, + nsIInputStream** _retval) { + nsresult rv; + nsCOMPtr inputStream = + do_CreateInstance("@mozilla.org/io/string-input-stream;1", &rv); + if (NS_FAILED(rv)) return rv; + + uint8_t* data; + uint32_t dataLen; + rv = ConvertToByteArray(aString, &dataLen, &data); + if (NS_FAILED(rv)) return rv; + + rv = inputStream->AdoptData(reinterpret_cast(data), dataLen); + if (NS_FAILED(rv)) { + free(data); + return rv; + } + + NS_ADDREF(*_retval = inputStream); + return rv; +} + +NS_IMETHODIMP +nsScriptableUnicodeConverter::GetCharset(nsACString& aCharset) { + if (!mDecoder) { + aCharset.Truncate(); + } else { + mDecoder->Encoding()->Name(aCharset); + } + return NS_OK; +} + +NS_IMETHODIMP +nsScriptableUnicodeConverter::SetCharset(const nsACString& aCharset) { + return InitConverter(aCharset); +} + +NS_IMETHODIMP +nsScriptableUnicodeConverter::GetIsInternal(bool* aIsInternal) { + *aIsInternal = mIsInternal; + return NS_OK; +} + +NS_IMETHODIMP +nsScriptableUnicodeConverter::SetIsInternal(const bool aIsInternal) { + mIsInternal = aIsInternal; + return NS_OK; +} + +nsresult nsScriptableUnicodeConverter::InitConverter( + const nsACString& aCharset) { + mEncoder = nullptr; + mDecoder = nullptr; + + auto encoding = Encoding::ForLabelNoReplacement(aCharset); + if (!encoding) { + return NS_ERROR_UCONV_NOCONV; + } + if (!(encoding == UTF_16LE_ENCODING || encoding == UTF_16BE_ENCODING)) { + mEncoder = encoding->NewEncoder(); + } + mDecoder = encoding->NewDecoderWithBOMRemoval(); + return NS_OK; +} diff --git a/intl/uconv/nsScriptableUConv.h b/intl/uconv/nsScriptableUConv.h new file mode 100644 index 0000000000..059a4b430c --- /dev/null +++ b/intl/uconv/nsScriptableUConv.h @@ -0,0 +1,34 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef __nsScriptableUConv_h_ +#define __nsScriptableUConv_h_ + +#include "nsIScriptableUConv.h" +#include "nsCOMPtr.h" +#include "mozilla/Encoding.h" + +class nsScriptableUnicodeConverter : public nsIScriptableUnicodeConverter { + public: + NS_DECL_ISUPPORTS + NS_DECL_NSISCRIPTABLEUNICODECONVERTER + + nsScriptableUnicodeConverter(); + + protected: + virtual ~nsScriptableUnicodeConverter(); + + mozilla::UniquePtr mEncoder; + mozilla::UniquePtr mDecoder; + bool mIsInternal; + + nsresult FinishWithLength(char** _retval, int32_t* aLength); + nsresult ConvertFromUnicodeWithLength(const nsAString& aSrc, int32_t* aOutLen, + char** _retval); + + nsresult InitConverter(const nsACString& aCharset); +}; + +#endif diff --git a/intl/uconv/nsTextToSubURI.cpp b/intl/uconv/nsTextToSubURI.cpp new file mode 100644 index 0000000000..e70d9ccbd8 --- /dev/null +++ b/intl/uconv/nsTextToSubURI.cpp @@ -0,0 +1,178 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#include "nsString.h" +#include "nsITextToSubURI.h" +#include "nsEscape.h" +#include "nsTextToSubURI.h" +#include "nsCRT.h" +#include "mozilla/ArrayUtils.h" +#include "mozilla/Encoding.h" +#include "mozilla/Preferences.h" +#include "mozilla/TextUtils.h" +#include "mozilla/Utf8.h" + +using namespace mozilla; + +nsTextToSubURI::~nsTextToSubURI() = default; + +NS_IMPL_ISUPPORTS(nsTextToSubURI, nsITextToSubURI) + +NS_IMETHODIMP +nsTextToSubURI::ConvertAndEscape(const nsACString& aCharset, + const nsAString& aText, nsACString& aOut) { + auto encoding = Encoding::ForLabelNoReplacement(aCharset); + if (!encoding) { + aOut.Truncate(); + return NS_ERROR_UCONV_NOCONV; + } + nsresult rv; + nsAutoCString intermediate; + std::tie(rv, std::ignore) = encoding->Encode(aText, intermediate); + if (NS_FAILED(rv)) { + aOut.Truncate(); + return rv; + } + bool ok = NS_Escape(intermediate, aOut, url_XPAlphas); + if (!ok) { + aOut.Truncate(); + return NS_ERROR_OUT_OF_MEMORY; + } + return NS_OK; +} + +NS_IMETHODIMP +nsTextToSubURI::UnEscapeAndConvert(const nsACString& aCharset, + const nsACString& aText, nsAString& aOut) { + auto encoding = Encoding::ForLabelNoReplacement(aCharset); + if (!encoding) { + aOut.Truncate(); + return NS_ERROR_UCONV_NOCONV; + } + nsAutoCString unescaped(aText); + NS_UnescapeURL(unescaped); + auto rv = encoding->DecodeWithoutBOMHandling(unescaped, aOut); + if (NS_SUCCEEDED(rv)) { + return NS_OK; + } + return rv; +} + +static bool statefulCharset(const char* charset) { + // HZ, UTF-7 and the CN and KR ISO-2022 variants are no longer in + // mozilla-central but keeping them here just in case for the benefit of + // comm-central. + if (!nsCRT::strncasecmp(charset, "ISO-2022-", sizeof("ISO-2022-") - 1) || + !nsCRT::strcasecmp(charset, "UTF-7") || + !nsCRT::strcasecmp(charset, "HZ-GB-2312")) + return true; + + return false; +} + +// static +nsresult nsTextToSubURI::convertURItoUnicode(const nsCString& aCharset, + const nsCString& aURI, + nsAString& aOut) { + // check for 7bit encoding the data may not be ASCII after we decode + bool isStatefulCharset = statefulCharset(aCharset.get()); + + if (!isStatefulCharset) { + if (IsAscii(aURI)) { + CopyASCIItoUTF16(aURI, aOut); + return NS_OK; + } + if (IsUtf8(aURI)) { + CopyUTF8toUTF16(aURI, aOut); + return NS_OK; + } + } + + // empty charset could indicate UTF-8, but aURI turns out not to be UTF-8. + NS_ENSURE_FALSE(aCharset.IsEmpty(), NS_ERROR_INVALID_ARG); + + auto encoding = Encoding::ForLabelNoReplacement(aCharset); + if (!encoding) { + aOut.Truncate(); + return NS_ERROR_UCONV_NOCONV; + } + return encoding->DecodeWithoutBOMHandlingAndWithoutReplacement(aURI, aOut); +} + +NS_IMETHODIMP nsTextToSubURI::UnEscapeURIForUI(const nsACString& aURIFragment, + bool aDontEscape, + nsAString& _retval) { + nsAutoCString unescapedSpec; + // skip control octets (0x00 - 0x1f and 0x7f) when unescaping + NS_UnescapeURL(PromiseFlatCString(aURIFragment), + esc_SkipControl | esc_AlwaysCopy, unescapedSpec); + + // in case of failure, return escaped URI + // Test for != NS_OK rather than NS_FAILED, because incomplete multi-byte + // sequences are also considered failure in this context + if (convertURItoUnicode("UTF-8"_ns, unescapedSpec, _retval) != NS_OK) { + // assume UTF-8 instead of ASCII because hostname (IDN) may be in UTF-8 + CopyUTF8toUTF16(aURIFragment, _retval); + } + + if (aDontEscape) { + return NS_OK; + } + + // If there are any characters that are unsafe for URIs, reescape those. + if (mIDNBlocklist.IsEmpty()) { + mozilla::net::InitializeBlocklist(mIDNBlocklist); + // we allow SPACE and IDEOGRAPHIC SPACE in this method + mozilla::net::RemoveCharFromBlocklist(u' ', mIDNBlocklist); + mozilla::net::RemoveCharFromBlocklist(0x3000, mIDNBlocklist); + } + + MOZ_ASSERT(!mIDNBlocklist.IsEmpty()); + const nsPromiseFlatString& unescapedResult = PromiseFlatString(_retval); + nsString reescapedSpec; + _retval = NS_EscapeURL( + unescapedResult, + [&](char16_t aChar) -> bool { + return mozilla::net::CharInBlocklist(aChar, mIDNBlocklist); + }, + reescapedSpec); + + return NS_OK; +} + +NS_IMETHODIMP +nsTextToSubURI::UnEscapeNonAsciiURIJS(const nsACString& aCharset, + const nsACString& aURIFragment, + nsAString& _retval) { + return UnEscapeNonAsciiURI(aCharset, aURIFragment, _retval); +} + +// static +nsresult nsTextToSubURI::UnEscapeNonAsciiURI(const nsACString& aCharset, + const nsACString& aURIFragment, + nsAString& _retval) { + nsAutoCString unescapedSpec; + NS_UnescapeURL(PromiseFlatCString(aURIFragment), + esc_AlwaysCopy | esc_OnlyNonASCII, unescapedSpec); + // leave the URI as it is if it's not UTF-8 and aCharset is not a ASCII + // superset since converting "http:" with such an encoding is always a bad + // idea. + if (!IsUtf8(unescapedSpec) && + (aCharset.LowerCaseEqualsLiteral("utf-16") || + aCharset.LowerCaseEqualsLiteral("utf-16be") || + aCharset.LowerCaseEqualsLiteral("utf-16le") || + aCharset.LowerCaseEqualsLiteral("utf-7") || + aCharset.LowerCaseEqualsLiteral("x-imap4-modified-utf7"))) { + CopyASCIItoUTF16(aURIFragment, _retval); + return NS_OK; + } + + nsresult rv = + convertURItoUnicode(PromiseFlatCString(aCharset), unescapedSpec, _retval); + // NS_OK_UDEC_MOREINPUT is a success code, so caller can't catch the error + // if the string ends with a valid (but incomplete) sequence. + return rv == NS_OK_UDEC_MOREINPUT ? NS_ERROR_UDEC_ILLEGALINPUT : rv; +} + +//---------------------------------------------------------------------- diff --git a/intl/uconv/nsTextToSubURI.h b/intl/uconv/nsTextToSubURI.h new file mode 100644 index 0000000000..1eaeb554dc --- /dev/null +++ b/intl/uconv/nsTextToSubURI.h @@ -0,0 +1,36 @@ +// -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- +// vim: set ts=2 et sw=2 tw=80: +// This Source Code is subject to the terms of the Mozilla Public License +// version 2.0 (the "License"). You can obtain a copy of the License at +// http://mozilla.org/MPL/2.0/. +#ifndef nsTextToSubURI_h__ +#define nsTextToSubURI_h__ + +#include "nsITextToSubURI.h" +#include "nsString.h" +#include "nsTArray.h" +#include "mozilla/net/IDNBlocklistUtils.h" + +class nsTextToSubURI : public nsITextToSubURI { + NS_DECL_ISUPPORTS + NS_DECL_NSITEXTTOSUBURI + + // Thread-safe function for C++ callers + static nsresult UnEscapeNonAsciiURI(const nsACString& aCharset, + const nsACString& aURIFragment, + nsAString& _retval); + + private: + virtual ~nsTextToSubURI(); + + // We assume that the URI is encoded as UTF-8. + static nsresult convertURItoUnicode(const nsCString& aCharset, + const nsCString& aURI, + nsAString& _retval); + + // Characters defined in netwerk/dns/IDNCharacterBlocklist.inc or via the + // network.IDN.extra_allowed_chars and network.IDN.extra_blocked_chars prefs. + nsTArray mIDNBlocklist; +}; + +#endif // nsTextToSubURI_h__ diff --git a/intl/uconv/tests/gtest/TestShortRead.cpp b/intl/uconv/tests/gtest/TestShortRead.cpp new file mode 100644 index 0000000000..393f5e0027 --- /dev/null +++ b/intl/uconv/tests/gtest/TestShortRead.cpp @@ -0,0 +1,109 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "gtest/gtest.h" +#include "mozilla/ErrorNames.h" +#include "nsCOMPtr.h" +#include "nsConverterInputStream.h" +#include "nsIInputStream.h" +#include "nsISupports.h" +#include "nsStringStream.h" + +namespace { + +class ShortReadWrapper final : public nsIInputStream { + public: + NS_DECL_THREADSAFE_ISUPPORTS + NS_DECL_NSIINPUTSTREAM + + template + ShortReadWrapper(const uint32_t (&aShortReads)[N], + nsIInputStream* aBaseStream) + : mShortReadIter(std::begin(aShortReads)), + mShortReadEnd(std::end(aShortReads)), + mBaseStream(aBaseStream) {} + + ShortReadWrapper(const ShortReadWrapper&) = delete; + ShortReadWrapper& operator=(const ShortReadWrapper&) = delete; + + private: + ~ShortReadWrapper() = default; + + const uint32_t* mShortReadIter; + const uint32_t* mShortReadEnd; + nsCOMPtr mBaseStream; +}; + +NS_IMPL_ISUPPORTS(ShortReadWrapper, nsIInputStream) + +NS_IMETHODIMP +ShortReadWrapper::Close() { return mBaseStream->Close(); } + +NS_IMETHODIMP +ShortReadWrapper::Available(uint64_t* aAvailable) { + nsresult rv = mBaseStream->Available(aAvailable); + NS_ENSURE_SUCCESS(rv, rv); + + if (mShortReadIter != mShortReadEnd) { + *aAvailable = std::min(uint64_t(*mShortReadIter), *aAvailable); + } + return NS_OK; +} + +NS_IMETHODIMP +ShortReadWrapper::StreamStatus() { return mBaseStream->StreamStatus(); } + +NS_IMETHODIMP +ShortReadWrapper::Read(char* aBuf, uint32_t aCount, uint32_t* _retval) { + if (mShortReadIter != mShortReadEnd) { + aCount = std::min(*mShortReadIter, aCount); + } + + nsresult rv = mBaseStream->Read(aBuf, aCount, _retval); + if (NS_SUCCEEDED(rv) && mShortReadIter != mShortReadEnd) { + ++mShortReadIter; + } + return rv; +} + +NS_IMETHODIMP +ShortReadWrapper::ReadSegments(nsWriteSegmentFun aWriter, void* aClosure, + uint32_t aCount, uint32_t* _retval) { + return NS_ERROR_NOT_IMPLEMENTED; +} + +NS_IMETHODIMP +ShortReadWrapper::IsNonBlocking(bool* _retval) { + return mBaseStream->IsNonBlocking(_retval); +} + +} // namespace + +TEST(ConverterStreamShortRead, ShortRead) +{ + uint8_t bytes[] = {0xd8, 0x35, 0xdc, 0x20}; + nsCOMPtr baseStream; + ASSERT_TRUE(NS_SUCCEEDED(NS_NewByteInputStream(getter_AddRefs(baseStream), + AsChars(mozilla::Span(bytes)), + NS_ASSIGNMENT_COPY))); + + static const uint32_t kShortReads[] = {1, 2, 1}; + nsCOMPtr shortStream = + new ShortReadWrapper(kShortReads, baseStream); + + RefPtr unicharStream = new nsConverterInputStream(); + ASSERT_TRUE(NS_SUCCEEDED( + unicharStream->Init(shortStream, "UTF-16BE", 4096, + nsIConverterInputStream::ERRORS_ARE_FATAL))); + + uint32_t read; + nsAutoString result; + ASSERT_TRUE( + NS_SUCCEEDED(unicharStream->ReadString(UINT32_MAX, result, &read))); + + ASSERT_EQ(read, 2u); + ASSERT_TRUE(result == u"\U0001d420"); +} diff --git a/intl/uconv/tests/gtest/moz.build b/intl/uconv/tests/gtest/moz.build new file mode 100644 index 0000000000..969fb52c7e --- /dev/null +++ b/intl/uconv/tests/gtest/moz.build @@ -0,0 +1,11 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +UNIFIED_SOURCES += [ + "TestShortRead.cpp", +] + +FINAL_LIBRARY = "xul-gtest" diff --git a/intl/uconv/tests/mochitest.ini b/intl/uconv/tests/mochitest.ini new file mode 100644 index 0000000000..0f13e77971 --- /dev/null +++ b/intl/uconv/tests/mochitest.ini @@ -0,0 +1,14 @@ +[DEFAULT] + +[test_bug335816.html] +[test_bug843434.html] +[test_bug959058-1.html] +[test_bug959058-2.html] +[test_long_doc.html] +[test_singlebyte_overconsumption.html] +[test_unicode_noncharacterescapes.html] +[test_unicode_noncharacters_gb18030.html] +[test_unicode_noncharacters_utf8.html] +[test_utf8_overconsumption.html] +[test_big5_encoder.html] +[test_ncr_fallback.html] diff --git a/intl/uconv/tests/moz.build b/intl/uconv/tests/moz.build new file mode 100644 index 0000000000..888186fb26 --- /dev/null +++ b/intl/uconv/tests/moz.build @@ -0,0 +1,13 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +TEST_DIRS += [ + "gtest", +] + +XPCSHELL_TESTS_MANIFESTS += ["unit/xpcshell.ini"] + +MOCHITEST_MANIFESTS += ["mochitest.ini"] diff --git a/intl/uconv/tests/stressgb.pl b/intl/uconv/tests/stressgb.pl new file mode 100644 index 0000000000..5b37fb63fb --- /dev/null +++ b/intl/uconv/tests/stressgb.pl @@ -0,0 +1,23 @@ +#!/usr/bin/perl +use LWP::Simple; +use IO::Handle; +$stdout = *STDOUT; +open(RES , ">resultlog.txt") || die "cannot open result log file"; +#system("rm alldiff.txt in*.txt out*.txt"); +for($i=10;$i<909;$i++) +{ + RES->printf("Test Page %d \n", $i); + $url = "http://people.netscape.com/ftang/testscript/gb18030/gbtext.cgi?page=" . $i; + RES->printf( "URL = %s\n", $url); + $tmpfile = "> in". $i . ".txt"; + open STDOUT, $tmpfile || RES->print("cannot open " . $tmpfile . "\n"); + getprint $url; + $cmd2 = "../../../dist/win32_d.obj/bin/nsconv -f GB18030 -t GB18030 in" . $i . ".txt out" . $i . ".txt >err"; + $cmd3 = "diff -u in" . $i . ".txt out" . $i . ".txt >> alldiff.txt"; + RES->printf( "Run '%s'\n", $cmd2); + $st2 = system($cmd2); + RES->printf( "result = '%d'\n", $st2); + RES->printf( "Run '%s'\n", $cmd3); + $st3 = system($cmd3); + RES->printf( "result = '%d'\n", $st3); +} diff --git a/intl/uconv/tests/test_big5_encoder.html b/intl/uconv/tests/test_big5_encoder.html new file mode 100644 index 0000000000..7e86683f00 --- /dev/null +++ b/intl/uconv/tests/test_big5_encoder.html @@ -0,0 +1,43 @@ + + + + + + Test for Unicode non-characters + + + + +
+
+
+Mozilla Bug 912470 +

+ + + diff --git a/intl/uconv/tests/test_singlebyte_overconsumption.html b/intl/uconv/tests/test_singlebyte_overconsumption.html new file mode 100644 index 0000000000..3aeeb928ec --- /dev/null +++ b/intl/uconv/tests/test_singlebyte_overconsumption.html @@ -0,0 +1,33 @@ + + + + + + Test for undefined codepoints + + + + +
+
+
+Mozilla Bug 564679 +

Evil.

+ + -> + + diff --git a/intl/uconv/tests/test_unicode_noncharacterescapes.html b/intl/uconv/tests/test_unicode_noncharacterescapes.html new file mode 100644 index 0000000000..e44f8d782b --- /dev/null +++ b/intl/uconv/tests/test_unicode_noncharacterescapes.html @@ -0,0 +1,303 @@ + + + + + + Test for Unicode non-characters + + + + +
+
+
+Mozilla Bug 445886 +

All good.

+ + + diff --git a/intl/uconv/tests/test_unicode_noncharacters_gb18030.html b/intl/uconv/tests/test_unicode_noncharacters_gb18030.html new file mode 100644 index 0000000000..0c9156d9e3 --- /dev/null +++ b/intl/uconv/tests/test_unicode_noncharacters_gb18030.html @@ -0,0 +1,305 @@ + + + + + + Test for Unicode non-characters + + + + +

+
+Mozilla Bug +445886 +

All good.

+ + + diff --git a/intl/uconv/tests/test_unicode_noncharacters_utf8.html b/intl/uconv/tests/test_unicode_noncharacters_utf8.html new file mode 100644 index 0000000000..ecfdbeae09 --- /dev/null +++ b/intl/uconv/tests/test_unicode_noncharacters_utf8.html @@ -0,0 +1,303 @@ + + + + + + Test for Unicode non-characters + + + + +
+
+
+Mozilla Bug 445886 +

All good.

+ + + diff --git a/intl/uconv/tests/test_utf8_overconsumption.html b/intl/uconv/tests/test_utf8_overconsumption.html new file mode 100644 index 0000000000..25c4a273ea --- /dev/null +++ b/intl/uconv/tests/test_utf8_overconsumption.html @@ -0,0 +1,39 @@ + + + + + + Test for Unicode non-characters + + + + +onload="Inject()"> +
+
+
+Mozilla Bug 445886 +

All good.

+ + + -- cgit v1.2.3