/
utf8.lua
276 lines (260 loc) · 6.15 KB
/
utf8.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
--UTF-8 encoding and decoding for LuaJIT
--Written by Cosmin Apreutesei. Public Domain.
--When this chunk receives no arguments (`...` is empty), load the
--`utf8_test` module and stop instead of defining the library.
if not ... then require'utf8_test'; return end
--LuaJIT libraries: `ffi` for typed buffers/pointers, `bit` for bitwise ops.
local ffi = require'ffi'
local bit = require'bit'
--shortcuts for the bitwise operations used by the encoder and decoder.
local band, shl, shr = bit.band, bit.lshift, bit.rshift
--the module table that the public functions are attached to.
local utf8 = {}
--variable-length array ctypes used to allocate output buffers.
local uint32_array = ffi.typeof'uint32_t[?]'
local uint8_array = ffi.typeof'uint8_t[?]'
--pointer ctypes used to view a Lua string's bytes as a 0-indexed buffer.
local uint32_ptr = ffi.typeof'const uint32_t*'
local uint8_ptr = ffi.typeof'const uint8_t*'
--Normalize an input argument into a `string, buffer, length` triple.
--  s          a Lua string, or any other value, which is passed through
--             untouched (callers index it from 0 like a buffer).
--  len        optional element count; for strings it is capped to the number
--             of whole elements available in the string.
--  ct         pointer ctype a string is cast to (default: const uint8_t*).
--  sizeof_ct  size in bytes of one element of `ct` (default: 1).
--Returns the original string (nil for non-strings), the buffer and the
--element count.
local function tobuf(s, len, ct, sizeof_ct)
	if type(s) ~= 'string' then
		return nil, s, len
	end
	--count whole elements only: a string whose byte length is not a multiple
	--of the element size must not produce a fractional element count.
	local available = math.floor(#s / (sizeof_ct or 1))
	return s, ffi.cast(ct or uint8_ptr, s), math.min(len or 1/0, available)
end
--Well-formed UTF-8 byte sequences (allowed value range of each byte):
--
-- byte 1     byte 2     byte 3     byte 4
--------------------------------------------
-- 00 - 7F
-- C2 - DF    80 - BF
-- E0         A0 - BF    80 - BF
-- E1 - EC    80 - BF    80 - BF
-- ED         80 - 9F    80 - BF
-- EE - EF    80 - BF    80 - BF
-- F0         90 - BF    80 - BF    80 - BF
-- F1 - F3    80 - BF    80 - BF    80 - BF
-- F4         80 - 8F    80 - BF    80 - BF
--Decode one UTF-8 sequence from a 0-indexed byte buffer.
--  buf  byte buffer, indexed from 0.
--  len  number of bytes in `buf`.
--  i    offset of the first byte of the sequence.
--Returns:
--  nil                when `i >= len` (end of buffer);
--  next_i, codepoint  for a well-formed sequence;
--  i + 1, nil, byte   when the byte at `i` does not start a well-formed
--                     sequence lying entirely inside the buffer (only that
--                     one byte is consumed).
function utf8.next(buf, len, i)
	if i >= len then
		return nil --EOS
	end
	local c1 = buf[i]
	i = i + 1
	if c1 <= 0x7F then
		return i, c1 --ASCII
	elseif c1 < 0xC2 then
		--invalid: not an accepted lead byte
	elseif c1 <= 0xDF then --2-byte
		if i < len then
			local c2 = buf[i]
			if c2 >= 0x80 and c2 <= 0xBF then
				return i + 1,
					shl(band(c1, 0x1F), 6)
					+ band(c2, 0x3F)
			end
		end
	elseif c1 <= 0xEF then --3-byte
		--both continuation bytes must be inside the buffer before reading.
		if i + 1 < len then
			local c2, c3 = buf[i], buf[i+1]
			if not (
				c2 < 0x80 or c2 > 0xBF
				or c3 < 0x80 or c3 > 0xBF
				or (c1 == 0xE0 and c2 < 0xA0) --overlong encoding
				or (c1 == 0xED and c2 > 0x9F) --surrogate range
			) then
				return i + 2,
					shl(band(c1, 0x0F), 12)
					+ shl(band(c2, 0x3F), 6)
					+ band(c3, 0x3F)
			end
		end
	elseif c1 <= 0xF4 then --4-byte
		--all three continuation bytes must be inside the buffer.
		if i + 2 < len then
			local c2, c3, c4 = buf[i], buf[i+1], buf[i+2]
			if not (
				c2 < 0x80 or c2 > 0xBF
				or c3 < 0x80 or c3 > 0xBF
				or c4 < 0x80 or c4 > 0xBF
				or (c1 == 0xF0 and c2 < 0x90) --overlong encoding
				or (c1 == 0xF4 and c2 > 0x8F) --above 0x10FFFF
			) then
				return i + 3,
					shl(band(c1, 0x07), 18)
					+ shl(band(c2, 0x3F), 12)
					+ shl(band(c3, 0x3F), 6)
					+ band(c4, 0x3F)
			end
		end
	end
	return i, nil, c1 --invalid
end
--Find the character (or invalid byte) that ends exactly at offset `i`.
--  buf  byte buffer, indexed from 0.
--  len  number of bytes in `buf`.
--  i    offset just past the character to find.
--Returns nil when `i <= 0`; otherwise the start offset of that character
--followed by the codepoint and invalid-byte values reported by `utf8.next`.
--Raises an assertion error if decoding forward steps over `i` without
--landing on it.
function utf8.prev(buf, len, i)
	if i <= 0 then
		return nil
	end
	local target = i
	local start = i
	--walk backwards over continuation bytes (80 - BF), at most 4 bytes, to
	--reach a byte that could start the previous character.
	repeat
		start = start - 1
		local byte = buf[start]
		local is_continuation = byte >= 0x80 and byte <= 0xBF
	until not is_continuation or start == target - 4 or start <= 0
	--decode forward from there until a sequence ends exactly at the target.
	while true do
		local next_start, code, bad_byte = utf8.next(buf, len, start)
		next_start = next_start or len
		if next_start == target then
			return start, code, bad_byte
		end
		start = next_start
		assert(start < target)
	end
end
--Iterate over the characters of a string.
--  s  the string to iterate over.
--  i  optional 1-based byte position to start from (default: 1).
--Returns an iterator function which, on each call, yields the 1-based byte
--position following the decoded sequence, the codepoint (nil for an invalid
--byte) and the invalid byte (nil for a valid sequence), and yields nil at
--the end of the string.
function utf8.chars(s, i)
	--`buf` is a raw pointer into the string's data and does not keep the
	--string alive by itself, so the string is anchored in the closure for as
	--long as the iteration runs.
	local anchor, buf, len = tobuf(s)
	i = i and i-1 or 0 --convert to a 0-based buffer offset
	return function()
		local c, b
		i, c, b = utf8.next(buf, len, i)
		if not i then
			anchor = nil --iteration is over: release the string
			return nil
		end
		return i+1, c, b
	end
end
--Decode UTF-8 bytes into a 0-indexed array of codepoints.
--  buf     a string, or a 0-indexed byte buffer.
--  len     number of bytes to decode (for strings, capped to the string's
--          length and optional).
--  out     output buffer; pass `false` to only get the output length, or
--          `nil` to have the function allocate the buffer.
--  outlen  capacity of `out` in codepoints (unlimited when omitted and `out`
--          is given; computed with a counting pass when `out` is nil).
--  repl    what to store for each invalid byte: a replacement codepoint, the
--          string 'iso-8859-1' to store the byte value itself, or nil to
--          skip invalid bytes.
--Returns `out, count, invalid_count`, or `count, invalid_count` when `out`
--is false, or `nil, 'overflow', offset` when `out` is too small.
function utf8.decode(buf, len, out, outlen, repl)
	local _, bytes, nbytes = tobuf(buf, len)
	if out == nil then
		if not outlen then
			--counting pass; only the first result (the count) is kept.
			outlen = utf8.decode(bytes, nbytes, false, nil, repl)
		end
		out = uint32_array(outlen + 1)
	end
	if not outlen then
		outlen = 1/0
	end
	local count, invalid, pos = 0, 0, 0
	local next_pos, code = utf8.next(bytes, nbytes, pos)
	while next_pos do
		if not code then
			invalid = invalid + 1
			if repl == 'iso-8859-1' then
				code = bytes[pos] --interpret as iso-8859-1 like browsers do
			else
				code = repl
			end
		end
		if code then
			if count >= outlen then
				return nil, 'overflow', pos
			end
			if out then
				out[count] = code
			end
			count = count + 1
		end
		pos = next_pos
		next_pos, code = utf8.next(bytes, nbytes, pos)
	end
	if not out then
		return count, invalid
	end
	return out, count, invalid
end
--Number of bytes needed to encode codepoint `c` in UTF-8.
--Returns `invalid_size` for values that cannot be encoded: negative values,
--values above 0x10FFFF and the surrogate range D800 - DFFF.
local function char_byte_count(c, invalid_size)
	local out_of_range = c < 0 or c > 0x10FFFF
	local is_surrogate = c >= 0xD800 and c <= 0xDFFF
	if out_of_range or is_surrogate then
		return invalid_size
	end
	if c <= 0x7F then
		return 1
	end
	if c <= 0x7FF then
		return 2
	end
	if c <= 0xFFFF then
		return 3
	end
	return 4
end
--Total number of UTF-8 bytes needed to encode `len` codepoints from the
--0-indexed buffer `buf`. Codepoints that cannot be encoded count as the
--encoded size of `repl`, or as 0 bytes when `repl` is not given or cannot be
--encoded itself.
local function byte_count(buf, len, repl)
	local repl_size = 0
	if repl then
		repl_size = char_byte_count(repl, 0)
	end
	local total = 0
	for i = 0, len - 1 do
		local size = char_byte_count(buf[i], repl_size)
		total = total + size
	end
	return total
end
--Encode one codepoint as UTF-8.
--  c     the codepoint to encode.
--  repl  optional replacement codepoint, encoded instead of `c` when `c`
--        cannot be encoded.
--Returns the byte count `n` followed by the `n` byte values. `n` is 0 (and
--no bytes are returned) when `c` cannot be encoded and `repl` is missing or
--cannot be encoded either.
local function encode_char(c, repl)
	local n, b1, b2, b3, b4 = 0
	if c < 0 or c > 0x10FFFF or (c >= 0xD800 and c <= 0xDFFF) then
		--not encodable: negative, above the Unicode range, or a surrogate.
		--negative values must be caught here, otherwise they would fall into
		--the ASCII branch below and produce a negative "byte".
		if repl then
			return encode_char(repl) --no `repl` here, so this cannot loop
		end
	elseif c <= 0x7F then
		b1 = c
		n = 1
	elseif c <= 0x7FF then
		b2 = 0x80 + band(c, 0x3F); c = shr(c, 6)
		b1 = 0xC0 + c
		n = 2
	elseif c <= 0xFFFF then
		b3 = 0x80 + band(c, 0x3F); c = shr(c, 6)
		b2 = 0x80 + band(c, 0x3F); c = shr(c, 6)
		b1 = 0xE0 + c
		n = 3
	else --c <= 0x10FFFF
		b4 = 0x80 + band(c, 0x3F); c = shr(c, 6)
		b3 = 0x80 + band(c, 0x3F); c = shr(c, 6)
		b2 = 0x80 + band(c, 0x3F); c = shr(c, 6)
		b1 = 0xF0 + c
		n = 4
	end
	return n, b1, b2, b3, b4
end
--Encode an array of codepoints as UTF-8 bytes.
--  buf     a 0-indexed buffer of codepoints, or a string whose bytes are
--          viewed as uint32_t values.
--  len     number of codepoints to encode (for strings, capped to the
--          number of elements in the string and optional).
--  out     output byte buffer; pass `false` to only get the output length,
--          or `nil` to have the function allocate the buffer.
--  outlen  capacity of `out` in bytes (unlimited when omitted and `out` is
--          given; computed with a counting pass when `out` is nil).
--  repl    optional replacement codepoint for values that cannot be encoded
--          (they are skipped when it is not given).
--Returns `out, byte_count`, or just the byte count when `out` is false, or
--`nil, 'overflow'` when `out` is too small.
function utf8.encode(buf, len, out, outlen, repl)
	local _, buf, len = tobuf(buf, len, uint32_ptr, 4)
	if out == nil then --allocate output buffer
		outlen = outlen or utf8.encode(buf, len, false, nil, repl)
		out = uint8_array(outlen + 1)
	elseif not out then --compute output length
		return byte_count(buf, len, repl)
	end
	--a caller-supplied buffer without a stated capacity is treated as
	--unbounded, the same way utf8.decode does; without this default the
	--overflow check below would compare a number with nil.
	outlen = outlen or 1/0
	local j = 0
	for i = 0, len-1 do
		local n, b1, b2, b3, b4 = encode_char(buf[i], repl)
		if n > outlen then
			return nil, 'overflow'
		end
		if b1 then out[j  ] = b1 end
		if b2 then out[j+1] = b2 end
		if b3 then out[j+2] = b3 end
		if b4 then out[j+3] = b4 end
		j = j + n
		outlen = outlen - n --remaining capacity
	end
	return out, j
end
--Encode codepoints into a UTF-8 Lua string.
--Two calling forms:
--  utf8.encode_chars(t, repl)  `t` is an array table of codepoints; `repl`
--                              is an optional replacement codepoint for the
--                              values that cannot be encoded.
--  utf8.encode_chars(c1, ...)  the codepoints are the arguments themselves;
--                              no replacement is used.
--Codepoints that cannot be encoded (and have no usable replacement) add
--nothing to the result.
function utf8.encode_chars(...)
	local char = string.char
	local pieces = {}
	local pos = 1 --index of the next free slot in `pieces`

	--append the encoded bytes of one codepoint as one-character strings.
	local function put(code, replacement)
		local n, b1, b2, b3, b4 = encode_char(code, replacement)
		if b1 then pieces[pos] = char(b1) end
		if b2 then pieces[pos+1] = char(b2) end
		if b3 then pieces[pos+2] = char(b3) end
		if b4 then pieces[pos+3] = char(b4) end
		pos = pos + n
	end

	local t, repl = ...
	if type(t) == 'table' then
		for k = 1, #t do
			put(t[k], repl)
		end
	else
		local argc = select('#', ...)
		for k = 1, argc do
			put((select(k, ...))) --parens keep only the k-th argument
		end
	end
	return table.concat(pieces)
end
--export the module table.
return utf8