summaryrefslogtreecommitdiffstats
path: root/nselib/punycode.lua
blob: ec389d45af4116d42a1176a5b9646eea1a248c00 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
---
-- Library methods for handling punycode strings.
--
-- Punycode is a simple and efficient transfer encoding syntax designed
-- for use with Internationalized Domain Names in Applications (IDNA).
-- It uniquely and reversibly transforms a Unicode string into an ASCII
-- string.  ASCII characters in the Unicode string are represented
-- literally, and non-ASCII characters are represented by ASCII
-- characters that are allowed in host name labels (letters, digits, and
-- hyphens).  RFC 3492 defines a general algorithm called
-- Bootstring that allows a string of basic code points to uniquely
-- represent any string of code points drawn from a larger set.
-- Punycode is an instance of Bootstring that uses the particular
-- parameter values specified by that RFC, appropriate for IDNA.
--
-- The advantages of the Bootstring algorithm are completeness, uniqueness,
-- reversibility, efficient encoding, simplicity and readability.
--
-- Portions of this library were adapted from punycode.js by Mathias Bynens
-- under the MIT License.
--
-- References:
-- * http://ietf.org/rfc/rfc3492.txt
-- * punycode.js: https://mths.be/punycode
--
-- @author Rewanth Cool
-- @copyright Same as Nmap--See https://nmap.org/book/man-legal.html

local stdnse = require "stdnse"
local string = require "string"
local math = require "math"
local table = require "table"
local unicode = require "unicode"
local unittest = require "unittest"

_ENV = stdnse.module("punycode", stdnse.seeall)

-- Localize a few functions for a tiny speed boost, since these will be
-- used frequently.
local floor = math.floor
local byte = string.byte
local char = string.char
local find = string.find
local match = string.match
local reverse = string.reverse
local sub = string.sub

-- Highest positive signed 32-bit integer value (2^31 - 1). The encoder and
-- decoder use it as their overflow bound.
local maxInt = 0x7FFFFFFF

-- Bootstring parameters for Punycode (RFC 3492, section 5)
local base = 0x24         -- 36
local tMin = 0x1          -- 1
local tMax = 0x1A         -- 26
local skew = 0x26         -- 38
local damp = 0x2BC        -- 700
local initialBias = 0x48  -- 72
local initialN = 0x80     -- 128
-- The delimiter is "-" (0x2D). The code is passed as a number so that we do
-- not depend on implicit string-to-number coercion of "0x2D".
local delimiter = char(0x2D)

-- Convenience shortcuts
local baseMinusTMin = base - tMin

-- Bias adaptation function as per section 3.4 of RFC 3492.
-- https://tools.ietf.org/html/rfc3492#section-3.4
-- The following function is adapted from punycode.js by Mathias Bynens
-- under the MIT License.
--
-- @param delta The delta that was just encoded or decoded.
-- @param numPoints Divisor used to scale the delta (callers pass the number
-- of code points handled so far, plus one).
-- @param firstTime If truthy, delta is divided by `damp`; otherwise it is
-- halved.
-- @return The new bias.
local function adapt(delta, numPoints, firstTime)

  -- Scale the delta down: strongly on the very first adaptation, by half on
  -- every later one.
  local scaled
  if firstTime then
    scaled = floor(delta / damp)
  else
    scaled = delta >> 1
  end
  scaled = scaled + floor(scaled / numPoints)

  -- Repeatedly divide until the value drops to the limit, accumulating one
  -- `base` per division.
  local limit = (baseMinusTMin * tMax) >> 1
  local offset = 0
  while scaled > limit do
    scaled = floor(scaled / baseMinusTMin)
    offset = offset + base
  end

  return floor(offset + (baseMinusTMin + 1) * scaled / (scaled + skew))

end

-- Maps a boolean onto an integer.
--
-- @param status The value to convert.
-- @return 1 when `status` is exactly `true`; 0 for anything else
-- (including `false`, `nil` and truthy non-boolean values).
local function boolToNum(status)

  if status ~= true then
    return 0
  end

  return 1

end

-- This function converts a basic code point into a digit/integer.
--
-- @param codePoint The basic numeric code point value.
-- @return The numeric value of a basic code point (for use in
-- representing integers) in the range `0` to `base - 1`, or `base` if
-- the code point does not represent a value.
-- The following function is adapted from punycode.js by Mathias Bynens
-- under the MIT License.
local function basicToDigit(codePoint)

  -- Both ends of every range are checked explicitly. A one-sided test such
  -- as `codePoint - 0x30 < 0x0A` is also true for every code point below
  -- the range, which would turn invalid characters (e.g. "-") into digits
  -- or into negative values.

  -- '0'..'9' map to 26..35
  if codePoint >= 0x30 and codePoint <= 0x39 then
    return codePoint - 0x16
  end
  -- 'A'..'Z' map to 0..25
  if codePoint >= 0x41 and codePoint <= 0x5A then
    return codePoint - 0x41
  end
  -- 'a'..'z' map to 0..25
  if codePoint >= 0x61 and codePoint <= 0x7A then
    return codePoint - 0x61
  end

  -- Not a Punycode digit.
  return base

end


-- Converts a digit/integer into the basic code point that represents it.
--
-- @param digit The numeric value to convert, which needs to be in the range
-- `0` to `base - 1`.
-- @param flag If different from 0, the uppercase form is used; else, the
-- lowercase form is used. The behavior is undefined if `flag` is non-zero
-- and `digit` has no uppercase form.
-- @return The basic code point whose value (when used for representing
-- integers) is `digit`.
-- The following function is adapted from punycode.js by Mathias Bynens
-- under the MIT License.
local function digitToBasic(digit, flag)
  -- 26..35 map to ASCII 0..9: adding 22 lands on 0x30..0x39.
  local codePoint = digit + 22

  --  0..25 map to ASCII a..z: a further 75 lands on 0x61..0x7A.
  if digit < 26 then
    codePoint = codePoint + 75
  end

  -- Uppercase letters sit 32 (1 << 5) below their lowercase forms.
  if flag ~= 0 then
    codePoint = codePoint - 32
  end

  return codePoint
end

-- Converts a list of Unicode code points into a Punycode string of
-- ASCII-only symbols (without any "xn--" prefix).
--
-- @param input list-table of Unicode code points
-- @return The Punycode-encoded string, or nil and an error message if an
-- overflow is detected while encoding.
-- The following function is adapted from punycode.js by Mathias Bynens
-- under the MIT License.
function encode_input(input)

  local total = #input

  -- Basic (ASCII) code points are copied to the output unchanged, in order.
  local pieces = {}
  for _, cp in ipairs(input) do
    if cp < 0x80 then
      pieces[#pieces + 1] = char(cp)
    end
  end

  -- `basicCount` is the number of basic code points; `handled` counts every
  -- code point that has been emitted so far.
  local basicCount = #pieces
  local handled = basicCount

  -- The basic part is terminated by a delimiter unless it is empty.
  if basicCount > 0 then
    pieces[#pieces + 1] = delimiter
  end

  -- Encoder state.
  local n, delta, bias = initialN, 0, initialBias

  while handled < total do
    -- Every non-basic code point below n is already done; pick the smallest
    -- remaining one.
    local smallest = maxInt
    for _, cp in ipairs(input) do
      if cp >= n and cp < smallest then
        smallest = cp
      end
    end

    -- Advance delta so the decoder's <n,i> state moves to <smallest,0>,
    -- refusing to go past maxInt.
    local span = handled + 1
    local jump = smallest - n
    if jump > floor((maxInt - delta) / span) then
      return nil, "Overflow exception occurred."
    end
    delta = delta + jump * span
    n = smallest

    for _, cp in ipairs(input) do

      if cp < n then
        delta = delta + 1
        if delta > maxInt then
          return nil, "Overflow exception occurred."
        end
      end

      if cp == n then
        -- Emit delta as a generalized variable-length integer.
        local q = delta
        local k = base

        while true do
          -- Threshold for this digit position, clamped to [tMin, tMax].
          local t
          if k <= bias then
            t = tMin
          elseif k >= bias + tMax then
            t = tMax
          else
            t = k - bias
          end

          if q < t then
            break
          end

          local rest = q - t
          local radix = base - t
          pieces[#pieces + 1] = char(digitToBasic(t + rest % radix, 0))

          q = floor(rest / radix)
          k = k + base
        end

        -- Final (most significant) digit.
        pieces[#pieces + 1] = char(digitToBasic(q, 0))

        -- `span` keeps the value computed before this pass over the input.
        bias = adapt(delta, span, handled == basicCount)
        delta = 0
        handled = handled + 1
      end
    end

    delta = delta + 1
    n = n + 1

  end

  return table.concat(pieces)

end

-- Converts a Punycode string of ASCII-only symbols to a
-- list-table of Unicode code points.
--
-- @param input The Punycode string of ASCII-only symbols (without the
-- "xn--" prefix).
-- @return The resulting list-table of Unicode code points, or nil and an
-- error message when the input is malformed: a non-basic byte before the
-- last delimiter, a character that is not a Punycode digit, input that ends
-- in the middle of an encoded integer, or an arithmetic overflow.
-- The following function is adapted from punycode.js by Mathias Bynens
-- under the MIT License.
function decode_input(input)

  local output = {}
  local inputLength = #input
  local i = 0
  local n = initialN
  local bias = initialBias

  -- `basic` is the number of characters in front of the LAST delimiter, or 0
  -- when there is no delimiter. The delimiter is searched for in plain-text
  -- mode because "-" is a magic character in Lua patterns.
  local basic = 0
  local posFromEnd = find(reverse(input), delimiter, 1, true)
  if posFromEnd then
    basic = inputLength - posFromEnd
  end

  -- Everything before the last delimiter is copied literally and must be
  -- basic (ASCII).
  for j = 1, basic do
    local value = byte(input, j)

    if value >= 0x80 then
      --error("Not basic")
      return nil, "Not basic exception occurred."
    end
    output[#output + 1] = value
  end

  -- `index` is the 0-based offset of the next unread character. Decoding
  -- starts just after the delimiter if any basic code points were copied,
  -- and at the beginning otherwise.
  local index = 0
  if basic > 0 then
    index = basic + 1
  end

  while index < inputLength do
    local oldi = i
    local w = 1
    local k = base

    -- Decode one generalized variable-length integer into `i`.
    while true do

      if index >= inputLength then
        --error("Invalid input")
        return nil, "Invalid input exception occurred."
      end

      index = index + 1
      local digit = basicToDigit(byte(input, index))

      if digit >= base then
        -- The character is not a Punycode digit at all.
        return nil, "Invalid input exception occurred."
      end
      if digit > floor((maxInt - i) / w) then
        --error('overflow');
        return nil, "Overflow exception occurred."
      end
      i = i + digit * w

      -- Threshold for this digit position, clamped to [tMin, tMax].
      local t
      if k <= bias then
        t = tMin
      elseif k >= bias + tMax then
        t = tMax
      else
        t = k - bias
      end

      -- A digit below the threshold terminates the integer.
      if digit < t then
        break
      end

      local baseMinusT = base - t
      if w > floor(maxInt / baseMinusT) then
        --error('overflow');
        return nil, "Overflow exception occurred."
      end

      w = w * baseMinusT
      k = k + base

    end

    local out = #output + 1

    bias = adapt(i - oldi, out, oldi == 0)

    -- `i` was supposed to wrap around from `out` to `0`,
    -- incrementing `n` each time, so we'll fix that now:
    local wraps = floor(i / out)
    if wraps > maxInt - n then
      --error('overflow');
      return nil, "Overflow exception occurred."
    end

    n = n + wraps
    i = i % out

    -- Insert n at 0-based position i, shifting later code points up by one.
    table.insert(output, i + 1, n)
    i = i + 1
  end

  return output

end

-- Punycode-encodes a single domain label.
--
-- A label made only of ASCII code points is returned as a plain string.
-- If any code point needed encoding, the result carries the "xn--" prefix.
--
-- @param u A list-table of Unicode code points representing a domain label
-- @return A punycode-encoded ASCII string, or nil and an error message if
-- the encoder reported an error.
function encode_label(u)

  -- Scan for any code point outside the ASCII range 0..127.
  local needsEncoding = false
  for _, codePoint in pairs(u) do
    if not (codePoint >= 0 and codePoint <= 127) then
      needsEncoding = true
      break
    end
  end

  -- Pure ASCII: hand the code points back as a string, no prefix.
  if not needsEncoding then
    return unicode.encode(u, unicode.utf8_enc)
  end

  local encoded, failure = encode_input(u)
  if failure then
    return nil, failure
  end

  return 'xn--' .. encoded

end

--- Turns a (possibly punycode-encoded) label into Unicode code points.
--
-- A label beginning with "xn--" has that prefix stripped and the remainder
-- punycode-decoded. Any other label is decoded as UTF-8 (ASCII).
--
-- @param s String of input.
-- @return A table of Unicode code points, or nil and an error message if
-- punycode decoding failed.
function decode_label(s)

  -- No ACE prefix: treat the label as ordinary UTF-8 text.
  if sub(s, 1, 4) ~= "xn--" then
    return unicode.decode(s, unicode.utf8_dec)
  end

  local codePoints, failure = decode_input(sub(s, 5))
  if failure then
    return nil, failure
  end

  return codePoints

end

-- Everything below is unit-test code; skip it unless tests are being run.
if not unittest.testing() then
  return _ENV
end

-- Punycode test vectors. Each entry is
-- { punycode-encoded label, the same label as a UTF-8 string }.
local testCases = {
  { "xn--0zwm56d", "\xe6\xb5\x8b\xe8\xaf\x95" },
  { "xn--knigsgsschen-lcb0w", "k\xc3\xb6nigsg\xc3\xa4sschen" },
  { "xn--ab-fsf", "a\xe0\xa5\x8db" },
  { "xn--maana-pta", "ma\xc3\xb1ana" },
  { "xn----dqo34k", "\xe2\x98\x83-\xe2\x8c\x98" },
}

test_suite = unittest.TestSuite:new()

-- Each vector is checked in both directions: decoding first, then encoding.
for _, case in ipairs(testCases) do
  local aceLabel, utf8Label = case[1], case[2]

  local decoded = decode_label(aceLabel)
  local roundTripped = unicode.encode(decoded, unicode.utf8_enc)
  test_suite:add_test(unittest.equal(roundTripped, utf8Label))

  local encoded = encode_label(unicode.decode(utf8Label, unicode.utf8_dec))
  test_suite:add_test(unittest.equal(encoded, aceLabel))
end

return _ENV