intl/uconv/nsTextToSubURI.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nsString.h"
#include "nsITextToSubURI.h"
#include "nsEscape.h"
#include "nsTextToSubURI.h"
#include "nsCRT.h"
#include "mozilla/ArrayUtils.h"
#include "mozilla/Encoding.h"
#include "mozilla/Preferences.h"
#include "mozilla/TextUtils.h"
#include "mozilla/Utf8.h"

using namespace mozilla;

nsTextToSubURI::~nsTextToSubURI() = default;

NS_IMPL_ISUPPORTS(nsTextToSubURI, nsITextToSubURI)

NS_IMETHODIMP
nsTextToSubURI::ConvertAndEscape(const nsACString& aCharset,
                                 const nsAString& aText, nsACString& aOut) {
  auto encoding = Encoding::ForLabelNoReplacement(aCharset);
  if (!encoding) {
    aOut.Truncate();
    return NS_ERROR_UCONV_NOCONV;
  }
  nsresult rv;
  nsAutoCString intermediate;
  std::tie(rv, std::ignore) = encoding->Encode(aText, intermediate);
  if (NS_FAILED(rv)) {
    aOut.Truncate();
    return rv;
  }
  bool ok = NS_Escape(intermediate, aOut, url_XPAlphas);
  if (!ok) {
    aOut.Truncate();
    return NS_ERROR_OUT_OF_MEMORY;
  }
  return NS_OK;
}

NS_IMETHODIMP
nsTextToSubURI::UnEscapeAndConvert(const nsACString& aCharset,
                                   const nsACString& aText, nsAString& aOut) {
  auto encoding = Encoding::ForLabelNoReplacement(aCharset);
  if (!encoding) {
    aOut.Truncate();
    return NS_ERROR_UCONV_NOCONV;
  }
  nsAutoCString unescaped(aText);
  NS_UnescapeURL(unescaped);
  auto rv = encoding->DecodeWithoutBOMHandling(unescaped, aOut);
  if (NS_SUCCEEDED(rv)) {
    return NS_OK;
  }
  return rv;
}

static bool statefulCharset(const char* charset) {
  // HZ, UTF-7 and the CN and KR ISO-2022 variants are no longer in
  // mozilla-central but keeping them here just in case for the benefit of
  // comm-central.
  if (!nsCRT::strncasecmp(charset, "ISO-2022-", sizeof("ISO-2022-") - 1) ||
      !nsCRT::strcasecmp(charset, "UTF-7") ||
      !nsCRT::strcasecmp(charset, "HZ-GB-2312"))
    return true;

  return false;
}

// static
nsresult nsTextToSubURI::convertURItoUnicode(const nsCString& aCharset,
                                             const nsCString& aURI,
                                             nsAString& aOut) {
  // check for 7bit encoding the data may not be ASCII after we decode
  bool isStatefulCharset = statefulCharset(aCharset.get());

  if (!isStatefulCharset) {
    if (IsAscii(aURI)) {
      CopyASCIItoUTF16(aURI, aOut);
      return NS_OK;
    }
    if (IsUtf8(aURI)) {
      CopyUTF8toUTF16(aURI, aOut);
      return NS_OK;
    }
  }

  // empty charset could indicate UTF-8, but aURI turns out not to be UTF-8.
  NS_ENSURE_FALSE(aCharset.IsEmpty(), NS_ERROR_INVALID_ARG);

  auto encoding = Encoding::ForLabelNoReplacement(aCharset);
  if (!encoding) {
    aOut.Truncate();
    return NS_ERROR_UCONV_NOCONV;
  }
  return encoding->DecodeWithoutBOMHandlingAndWithoutReplacement(aURI, aOut);
}

NS_IMETHODIMP nsTextToSubURI::UnEscapeURIForUI(const nsACString& aURIFragment,
                                               bool aDontEscape,
                                               nsAString& _retval) {
  nsAutoCString unescapedSpec;
  // skip control octets (0x00 - 0x1f and 0x7f) when unescaping
  NS_UnescapeURL(PromiseFlatCString(aURIFragment),
                 esc_SkipControl | esc_AlwaysCopy, unescapedSpec);

  // in case of failure, return escaped URI
  // Test for != NS_OK rather than NS_FAILED, because incomplete multi-byte
  // sequences are also considered failure in this context
  if (convertURItoUnicode("UTF-8"_ns, unescapedSpec, _retval) != NS_OK) {
    // assume UTF-8 instead of ASCII  because hostname (IDN) may be in UTF-8
    CopyUTF8toUTF16(aURIFragment, _retval);
  }

  if (aDontEscape) {
    return NS_OK;
  }

  // If there are any characters that are unsafe for URIs, reescape those.
  if (mIDNBlocklist.IsEmpty()) {
    mozilla::net::InitializeBlocklist(mIDNBlocklist);
    // we allow SPACE and IDEOGRAPHIC SPACE in this method
    mozilla::net::RemoveCharFromBlocklist(u' ', mIDNBlocklist);
    mozilla::net::RemoveCharFromBlocklist(0x3000, mIDNBlocklist);
  }

  MOZ_ASSERT(!mIDNBlocklist.IsEmpty());
  const nsPromiseFlatString& unescapedResult = PromiseFlatString(_retval);
  nsString reescapedSpec;
  _retval = NS_EscapeURL(
      unescapedResult,
      [&](char16_t aChar) -> bool {
        return mozilla::net::CharInBlocklist(aChar, mIDNBlocklist);
      },
      reescapedSpec);

  return NS_OK;
}

NS_IMETHODIMP
nsTextToSubURI::UnEscapeNonAsciiURIJS(const nsACString& aCharset,
                                      const nsACString& aURIFragment,
                                      nsAString& _retval) {
  return UnEscapeNonAsciiURI(aCharset, aURIFragment, _retval);
}

// static
nsresult nsTextToSubURI::UnEscapeNonAsciiURI(const nsACString& aCharset,
                                             const nsACString& aURIFragment,
                                             nsAString& _retval) {
  nsAutoCString unescapedSpec;
  NS_UnescapeURL(PromiseFlatCString(aURIFragment),
                 esc_AlwaysCopy | esc_OnlyNonASCII, unescapedSpec);
  // leave the URI as it is if it's not UTF-8 and aCharset is not a ASCII
  // superset since converting "http:" with such an encoding is always a bad
  // idea.
  if (!IsUtf8(unescapedSpec) &&
      (aCharset.LowerCaseEqualsLiteral("utf-16") ||
       aCharset.LowerCaseEqualsLiteral("utf-16be") ||
       aCharset.LowerCaseEqualsLiteral("utf-16le") ||
       aCharset.LowerCaseEqualsLiteral("utf-7") ||
       aCharset.LowerCaseEqualsLiteral("x-imap4-modified-utf7"))) {
    CopyASCIItoUTF16(aURIFragment, _retval);
    return NS_OK;
  }

  nsresult rv =
      convertURItoUnicode(PromiseFlatCString(aCharset), unescapedSpec, _retval);
  // NS_OK_UDEC_MOREINPUT is a success code, so caller can't catch the error
  // if the string ends with a valid (but incomplete) sequence.
  return rv == NS_OK_UDEC_MOREINPUT ? NS_ERROR_UDEC_ILLEGALINPUT : rv;
}

//----------------------------------------------------------------------