1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
|
---
-- Library methods for handling punycode strings.
--
-- Punycode is a simple and efficient transfer encoding syntax designed
-- for use with Internationalized Domain Names in Applications (IDNA).
-- It uniquely and reversibly transforms a Unicode string into an ASCII
-- string. ASCII characters in the Unicode string are represented
-- literally, and non-ASCII characters are represented by ASCII
-- characters that are allowed in host name labels (letters, digits, and
-- hyphens). RFC 3492 defines a general algorithm called
-- Bootstring that allows a string of basic code points to uniquely
-- represent any string of code points drawn from a larger set.
-- Punycode is an instance of Bootstring that uses particular parameter
-- values specified by that RFC, appropriate for IDNA.
--
-- The advantages of the Bootstring algorithm are completeness, uniqueness,
-- reversibility, efficient encoding, simplicity, and readability.
--
-- Portions of this library were adapted from punycode.js by Mathias Bynens
-- under the MIT License.
--
-- References:
-- * http://ietf.org/rfc/rfc3492.txt
-- * punycode.js: https://mths.be/punycode
--
-- @author Rewanth Cool
-- @copyright Same as Nmap--See https://nmap.org/book/man-legal.html
local stdnse = require "stdnse"
local string = require "string"
local math = require "math"
local table = require "table"
local unicode = require "unicode"
local unittest = require "unittest"
_ENV = stdnse.module("punycode", stdnse.seeall)
-- Localize a few functions for a tiny speed boost, since these will be
-- used frequently.
local floor = math.floor
local byte = string.byte
local char = string.char
local find = string.find
local match = string.match
local reverse = string.reverse
local sub = string.sub

-- Highest positive signed 32-bit integer value (2^31 - 1)
local maxInt = 0x7FFFFFFF

-- Bootstring parameters
local base = 0x24         -- 36
local tMin = 0x1          -- 1
local tMax = 0x1A         -- 26
local skew = 0x26         -- 38
local damp = 0x2BC        -- 700
local initialBias = 0x48  -- 72
local initialN = 0x80     -- 128
-- The delimiter is the hyphen-minus "-". The code point is passed as a
-- number so that no implicit string-to-number coercion is involved.
local delimiter = char(0x2D)

-- Convenience shortcuts
local baseMinusTMin = base - tMin
-- Bias adaptation function as per section 3.4 of RFC 3492.
-- https://tools.ietf.org/html/rfc3492#section-3.4
--
-- @param delta The delta that was just encoded or decoded.
-- @param numPoints Divisor used to grow the scaled delta (the callers in
--                  this file pass the code point count including the
--                  current one).
-- @param firstTime True for the first adaptation: delta is then divided
--                  by `damp` instead of being halved.
-- @return The new bias.
-- The following function is adapted from punycode.js by Mathias Bynens
-- under the MIT License.
local function adapt(delta, numPoints, firstTime)
  -- Scale delta down: by `damp` on the very first call, by two afterwards.
  local scaled
  if not firstTime then
    scaled = delta >> 1
  else
    scaled = floor(delta / damp)
  end
  scaled = scaled + floor(scaled / numPoints)

  -- Repeatedly divide until the value falls below the threshold,
  -- adding `base` to the result for every division performed.
  local threshold = (baseMinusTMin * tMax) >> 1
  local k = 0
  while scaled > threshold do
    scaled = floor(scaled / baseMinusTMin)
    k = k + base
  end

  local numerator = (baseMinusTMin + 1) * scaled
  return floor(k + numerator / (scaled + skew))
end
-- Maps a boolean onto an integer.
--
-- @param status Value to convert; only the boolean `true` counts as set.
-- @return 1 when `status` is exactly `true`, 0 for anything else.
local function boolToNum(status)
  return (status == true) and 1 or 0
end
-- This function converts a basic code point into a digit/integer.
--
-- Each range test checks both bounds explicitly. Lua numbers are signed,
-- so a subtraction-only test such as `codePoint - 0x30 < 0x0A` would also
-- accept every code point below the range (for example "-" or "@") and
-- return a bogus or negative digit for it.
--
-- @param codePoint The basic numeric code point value.
-- @return The numeric value of a basic code point (for use in
-- representing integers) in the range `0` to `base - 1`, or `base` if
-- the code point does not represent a value.
-- The following function is adapted from punycode.js by Mathias Bynens
-- under the MIT License.
local function basicToDigit(codePoint)
  -- "0".."9" map to 26..35
  if codePoint >= 0x30 and codePoint < 0x3A then
    return 26 + (codePoint - 0x30)
  end
  -- "A".."Z" map to 0..25
  if codePoint >= 0x41 and codePoint < 0x5B then
    return codePoint - 0x41
  end
  -- "a".."z" map to 0..25
  if codePoint >= 0x61 and codePoint < 0x7B then
    return codePoint - 0x61
  end
  -- Anything else is not a Punycode digit.
  return base
end
-- Converts a digit/integer into the basic code point that represents it.
--
-- @param digit The numeric value to represent; expected to be in the
-- range `0` to `base - 1`.
-- @param flag If `flag` is anything other than the number 0 (including
-- nil), 32 is subtracted from the result, which selects the uppercase
-- form for letters; the result is not meaningful when `digit` has no
-- uppercase form.
-- @return The basic code point whose value is `digit`.
-- The following function is adapted from punycode.js by Mathias Bynens
-- under the MIT License.
local function digitToBasic(digit, flag)
  -- 26..35 map to ASCII 0..9 (an offset of 22)
  local codePoint = digit + 22
  -- 0..25 map to ASCII a..z (a further offset of 75)
  if digit < 26 then
    codePoint = codePoint + 75
  end
  -- Uppercase letters sit 32 positions below their lowercase forms.
  if flag ~= 0 then
    codePoint = codePoint - 32
  end
  return codePoint
end
-- Encodes a list-table of Unicode code points as a Punycode string of
-- ASCII-only symbols. No "xn--" prefix is added here.
--
-- @param input list-table of Unicode code points
-- @return The encoded string, or nil and an error message when an
-- overflow is detected.
-- The following function is adapted from punycode.js by Mathias Bynens
-- under the MIT License.
function encode_input(input)
  local output = {}
  local inputLength = #input

  -- Copy the basic (ASCII) code points through unchanged.
  for _, codePoint in ipairs(input) do
    if codePoint < 0x80 then
      output[#output + 1] = char(codePoint)
    end
  end

  -- `basicLength` is the number of basic code points copied above.
  -- `handled` is the number of code points that have been encoded so far.
  local basicLength = #output
  local handled = basicLength

  -- A non-empty basic part is terminated by the delimiter.
  if basicLength > 0 then
    output[#output + 1] = delimiter
  end

  -- Encoder state.
  local n, delta, bias = initialN, 0, initialBias

  while handled < inputLength do
    -- Every non-basic code point below n is done; pick the smallest
    -- remaining one.
    local m = maxInt
    for _, codePoint in ipairs(input) do
      if codePoint >= n and codePoint < m then
        m = codePoint
      end
    end

    -- Advance delta so the decoder state moves from <n,i> to <m,0>,
    -- refusing to go past maxInt.
    local handledPlusOne = handled + 1
    if m - n > floor((maxInt - delta) / handledPlusOne) then
      return nil, "Overflow exception occurred."
    end
    delta = delta + (m - n) * handledPlusOne
    n = m

    for _, codePoint in ipairs(input) do
      if codePoint < n then
        -- An already-handled code point: it only widens the gap.
        delta = delta + 1
        if delta > maxInt then
          return nil, "Overflow exception occurred."
        end
      elseif codePoint == n then
        -- Emit delta as a generalized variable-length integer.
        local q = delta
        local k = base
        while true do
          local t
          if k <= bias then
            t = tMin
          elseif k >= bias + tMax then
            t = tMax
          else
            t = k - bias
          end
          if q < t then
            break
          end
          local qMinusT = q - t
          local baseMinusT = base - t
          output[#output + 1] = char(digitToBasic(t + qMinusT % baseMinusT, 0))
          q = floor(qMinusT / baseMinusT)
          k = k + base
        end
        output[#output + 1] = char(digitToBasic(q, 0))
        bias = adapt(delta, handledPlusOne, handled == basicLength)
        delta = 0
        handled = handled + 1
      end
    end

    delta = delta + 1
    n = n + 1
  end

  return table.concat(output)
end
-- Decodes a Punycode string of ASCII-only symbols (without the "xn--"
-- prefix) into a list-table of Unicode code points.
--
-- @param input The Punycode string of ASCII-only symbols.
-- @return The resulting list-table of Unicode code points, or nil and an
-- error message when the input is rejected.
-- The following function is adapted from punycode.js by Mathias Bynens
-- under the MIT License.
function decode_input(input)
  local output = {}
  local inputLength = #input
  local i = 0
  local n = initialN
  local bias = initialBias

  -- `basic` is the number of characters in front of the last delimiter;
  -- it stays 0 when the input has no delimiter at all.
  local basic = 0
  local posFromEnd = find(reverse(input), delimiter)
  if posFromEnd then
    basic = inputLength - posFromEnd
  end

  -- Those leading characters are literal basic code points.
  for j = 1, basic do
    local value = byte(input, j)
    if value >= 0x80 then
      return nil, "Not basic exception occurred."
    end
    output[#output + 1] = value
  end

  -- `index` is a 0-based read position; it starts just past the delimiter
  -- when there was a basic part, otherwise at the very beginning.
  local index = 0
  if basic > 0 then
    index = basic + 1
  end

  while index < inputLength do
    -- Decode one generalized variable-length integer into `i`.
    local oldi = i
    local w = 1
    local k = base
    while true do
      if index >= inputLength then
        return nil, "Invalid input exception occurred."
      end
      local digit = basicToDigit(byte(input, index + 1))
      index = index + 1
      if digit >= base or digit > floor((maxInt - i) / w) then
        return nil, "Overflow exception occurred."
      end
      i = i + digit * w

      local t
      if k <= bias then
        t = tMin
      elseif k >= bias + tMax then
        t = tMax
      else
        t = k - bias
      end
      if digit < t then
        break
      end

      local baseMinusT = base - t
      if w > floor(maxInt / baseMinusT) then
        return nil, "Overflow exception occurred."
      end
      w = w * baseMinusT
      k = k + base
    end

    local out = #output + 1
    bias = adapt(i - oldi, out, oldi == 0)

    -- `i` was supposed to wrap around from `out` to `0`, incrementing `n`
    -- each time, so that is applied now.
    local wraps = floor(i / out)
    if wraps > maxInt - n then
      return nil, "Overflow exception occurred."
    end
    n = n + wraps
    i = i % out

    -- Insert `n` at 0-based position `i`, shifting later entries up.
    table.insert(output, i + 1, n)
    i = i + 1
  end

  return output
end
-- Punycode-encodes a single domain label.
--
-- A label whose code points are all in the range 0..127 is passed to
-- `unicode.encode` with the UTF-8 encoder and returned as is. Otherwise
-- the label is encoded with `encode_input` and prefixed with "xn--".
--
-- @param u A list-table of Unicode code points representing a domain label
-- @return A punycode-encoded ASCII string, or nil and an error message if
-- `encode_input` reports an error.
function encode_label(u)
  -- Scan for the first code point outside the ASCII range.
  local needsEncoding = false
  for _, codePoint in pairs(u) do
    local isAscii = codePoint >= 0 and codePoint <= 127
    if not isAscii then
      needsEncoding = true
      break
    end
  end

  if not needsEncoding then
    return unicode.encode(u, unicode.utf8_enc)
  end

  local encoded, err = encode_input(u)
  if err then
    return nil, err
  end
  return 'xn--' .. encoded
end
--- Turns a domain label into a table of Unicode code points.
--
-- A label beginning with the (lowercase) prefix "xn--" has that prefix
-- removed and the remainder decoded with `decode_input`. Any other label
-- is handed to `unicode.decode` with the UTF-8 decoder.
--
-- @param s String of input.
-- @return A table of Unicode code points, or nil and an error message if
-- `decode_input` reports an error.
function decode_label(s)
  -- Labels without the prefix are plain text.
  if not match(s, "^xn%-%-") then
    return unicode.decode(s, unicode.utf8_dec)
  end

  -- Skip the four prefix characters and decode the rest.
  local codePoints, err = decode_input(sub(s, 5))
  if err then
    return nil, err
  end
  return codePoints
end
-- Everything below is unit-test scaffolding; stop here unless tests are
-- being run.
if not unittest.testing() then
  return _ENV
end

-- Punycode test vectors. Each entry is
-- { punycode-encoded label, the same label as UTF-8 text }.
local testCases = {
  { "xn--0zwm56d", "\xe6\xb5\x8b\xe8\xaf\x95" },
  { "xn--knigsgsschen-lcb0w", "k\xc3\xb6nigsg\xc3\xa4sschen" },
  { "xn--ab-fsf", "a\xe0\xa5\x8db" },
  { "xn--maana-pta", "ma\xc3\xb1ana" },
  { "xn----dqo34k", "\xe2\x98\x83-\xe2\x8c\x98" },
}

test_suite = unittest.TestSuite:new()

-- Register two checks per vector: decoding the label must give the UTF-8
-- text, and encoding the UTF-8 text must give the label back.
for _, case in ipairs(testCases) do
  local label, utf8Text = case[1], case[2]

  local decoded = unicode.encode(decode_label(label), unicode.utf8_enc)
  test_suite:add_test(unittest.equal(decoded, utf8Text))

  local encoded = encode_label(unicode.decode(utf8Text, unicode.utf8_dec))
  test_suite:add_test(unittest.equal(encoded, label))
end

return _ENV
|