1 /* ################################################################### */
2 /* Copyright 2015, Pierre Gentile (p.gen.progs@gmail.com) */
4 /* This Source Code Form is subject to the terms of the Mozilla Public */
5 /* License, v. 2.0. If a copy of the MPL was not distributed with this */
6 /* file, You can obtain one at https://mozilla.org/MPL/2.0/. */
7 /* ################################################################### */
9 /* ************************************* */
10 /* Various UTF-8 manipulation functions. */
11 /* ************************************* */
23 /* =========================================================== */
24 /* UTF-8 byte sequence generation from a given UCS-4 codepoint */
25 /* utf8_str must be preallocated with a size of at least 5 */
27 /* return the length of the generated sequence or 0 if c is */
28 /* not a valid codepoint. */
29 /* =========================================================== */
31 cptoutf8(char *utf8_str
, uint32_t c
)
42 utf8_str
[0] = 0xC0 | ((c
>> 6) & 0x1F);
43 utf8_str
[1] = 0x80 | (c
& 0x3F);
48 utf8_str
[0] = 0xE0 | ((c
>> 12) & 0x0F);
49 utf8_str
[1] = 0x80 | ((c
>> 6) & 0x3F);
50 utf8_str
[2] = 0x80 | (c
& 0x3F);
53 else if (c
< 0x110000)
55 utf8_str
[0] = 0xF0 | ((c
>> 18) & 0x07);
56 utf8_str
[1] = 0x80 | ((c
>> 12) & 0x3F);
57 utf8_str
[2] = 0x80 | ((c
>> 6) & 0x3F);
58 utf8_str
[3] = 0x80 | (c
& 0x3F);
65 /* ======================================================================= */
66 /* Unicode (UTF-8) ASCII representation interpreter. */
67 /* The string passed will be altered but its address will not change. */
68 /* All hexadecimal sequences of \uxx, \uxxxx, \uxxxxxx and \uxxxxxxxx will */
69 /* be replaced by the corresponding UTF-8 character when possible. */
70 /* All hexadecimal sequences of \Uxxxxxx will be replaced with the UTF-8 */
71 /* sequence corresponding to the given UCS-4 codepoint. */
72 /* When not possible the substitution character is substituted in place. */
73 /* Returns 0 if the conversion has failed else 1. */
74 /* ======================================================================= */
76 utf8_interpret(char *s
, char substitute
)
78 char *utf8_str
; /* \uxx... */
79 size_t utf8_to_eos_len
; /* bytes in s starting from the first *
80 * occurrence of \u. */
81 size_t init_len
; /* initial lengths of the string to interpret */
82 size_t utf8_ascii_len
; /* 2,4,6 or 8 bytes. */
83 size_t len_to_remove
= 0; /* number of bytes to remove after the *
85 char tmp
[9]; /* temporary string. */
86 int rc
= 1; /* return code, 0: error, 1: fine. */
88 /* Guard against the case where s is NULL. */
89 /* """"""""""""""""""""""""""""""""""""""" */
95 /* Manage \U codepoints. */
96 /* """"""""""""""""""""" */
97 while ((utf8_str
= strstr(s
,
105 int subst
; /* 0, the \U sequence is valid, else 1. */
107 utf8_to_eos_len
= strlen(utf8_str
);
110 n
= sscanf(utf8_str
+ 2,
121 if (n
== 1 && utf8_str_len
== 6)
123 sscanf(tmp
, "%x", &cp
);
125 subst
= 1; /* Invalid range. */
131 len
= cptoutf8(str
, cp
);
133 *(utf8_str
+ 1) = 'u';
134 memmove(utf8_str
, str
, len
);
135 memmove(utf8_str
+ len
, utf8_str
+ 8, utf8_to_eos_len
- 8);
136 len_to_remove
+= 8 - len
;
140 subst
= 1; /* Invalid sequence. */
142 /* In case of invalid \U sequence, replace it with the */
143 /* substitution character. */
144 /* ''''''''''''''''''''''''''''''''''''''''''''''''''' */
147 *utf8_str
= substitute
;
148 memmove(utf8_str
+ 1,
149 utf8_str
+ 2 + utf8_str_len
,
150 utf8_to_eos_len
- (utf8_str_len
+ 2 - 1));
151 len_to_remove
+= utf8_str_len
+ 2 - 1;
155 /* Make sure that the string is well terminated. */
156 /* """"""""""""""""""""""""""""""""""""""""""""" */
157 *(s
+ init_len
- len_to_remove
) = '\0';
159 /* Manage \u UTF-8 byte sequences. */
160 /* """"""""""""""""""""""""""""""" */
161 while ((utf8_str
= strstr(s
,
166 utf8_to_eos_len
= strlen(utf8_str
);
167 if (utf8_to_eos_len
< 4) /* string too short to contain *
168 | a valid UTF-8 char. */
170 *utf8_str
= substitute
;
171 *(utf8_str
+ 1) = '\0';
174 else /* s is long enough. */
177 char *utf8_seq_offset
= utf8_str
+ 2;
179 /* Get the first 2 UTF-8 bytes. */
180 /* """""""""""""""""""""""""""" */
181 *tmp
= *utf8_seq_offset
;
182 *(tmp
+ 1) = *(utf8_seq_offset
+ 1);
185 /* If they are invalid, replace the \u sequence by the */
186 /* substitute character. */
187 /* """"""""""""""""""""""""""""""""""""""""""""""""""" */
188 if (!isxdigit(tmp
[0]) || !isxdigit(tmp
[1]))
190 *utf8_str
= substitute
;
191 if (4 >= utf8_to_eos_len
)
192 *(utf8_str
+ 1) = '\0';
195 /* Do not forget the trailing \0. */
196 /* """""""""""""""""""""""""""""" */
197 memmove(utf8_str
+ 1, utf8_str
+ 4, utf8_to_eos_len
- 4 + 1);
205 char b
[3] = { ' ', ' ', '\0' };
207 /* They are valid, deduce from them the length of the sequence. */
208 /* """""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
209 sscanf(tmp
, "%2x", &byte
);
211 utf8_ascii_len
= utf8_get_length(byte
) * 2;
213 /* replace the \u sequence by the bytes forming the UTF-8 char. */
214 /* """""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
216 /* Put the bytes in the tmp string. */
217 /* '''''''''''''''''''''''''''''''' */
218 *tmp
= byte
; /* Reuse the tmp array. */
220 for (i
= 1; i
< utf8_ascii_len
/ 2; i
++)
225 n
= sscanf(utf8_seq_offset
+ 2 * i
, "%c%c", &b
[0], &b
[1]);
231 sscanf(b
, "%x%c", &byte
, &end
);
233 if (byte
== 0 || end
!= '\0' || (byte
& 0xc0) != 0x80)
242 utf8_ascii_len
= 2 * i
; /* Force the new length according to the *
243 | number of valid UTF-8 bytes read. */
245 tmp
[utf8_ascii_len
/ 2] = '\0';
247 /* Do they form a valid UTF-8 char? */
248 /* '''''''''''''''''''''''''''''''' */
249 if (utf8_validate(tmp
) == NULL
)
251 /* Put them back in the original string and move */
252 /* the remaining bytes after them. */
253 /* ''''''''''''''''''''''''''''''''''''''''''''' */
254 memmove(utf8_str
, tmp
, utf8_ascii_len
/ 2);
256 if (utf8_to_eos_len
< utf8_ascii_len
)
257 *(utf8_str
+ utf8_ascii_len
/ 2 + 1) = '\0';
259 memmove(utf8_str
+ utf8_ascii_len
/ 2,
260 utf8_seq_offset
+ utf8_ascii_len
,
261 utf8_to_eos_len
- utf8_ascii_len
- 2 + 1);
265 /* The invalid sequence is replaced by a */
266 /* substitution character. */
267 /* ''''''''''''''''''''''''''''''''''''' */
268 *utf8_str
= substitute
;
270 if (utf8_to_eos_len
< utf8_ascii_len
)
271 *(utf8_str
+ 1) = '\0';
273 memmove(utf8_str
+ 1,
274 utf8_seq_offset
+ utf8_ascii_len
,
275 utf8_to_eos_len
- utf8_ascii_len
- 2 + 1);
281 /* Update the number of bytes to remove at the end */
282 /* of the initial string. */
283 /* """"""""""""""""""""""""""""""""""""""""""""""" */
284 len_to_remove
+= 2 + utf8_ascii_len
/ 2;
289 /* Make sure that the string is well terminated. */
290 /* """"""""""""""""""""""""""""""""""""""""""""" */
291 *(s
+ init_len
- len_to_remove
) = '\0';
296 /* ========================================================= */
297 /* Decodes the number of bytes taken by a UTF-8 glyph. */
298 /* It is the length of the leading sequence of bits set to 1 */
299 /* in the first byte. */
300 /* ========================================================= */
302 utf8_get_length(unsigned char c
)
314 /* ==================================================== */
315 /* Returns the byte offset of the nth UTF-8 glyph in s. */
316 /* ==================================================== */
318 utf8_offset(char const *s
, size_t n
)
326 (void)(((s
[++i
] & 0xc0) != 0x80) || ((s
[++i
] & 0xc0) != 0x80) || ++i
);
333 /* ============================================== */
334 /* Points to the previous UTF-8 glyph in a string */
335 /* from the given position. */
336 /* ============================================== */
338 utf8_prev(const char *str
, const char *p
)
340 while ((*p
& 0xc0) == 0x80)
343 for (--p
; p
>= str
; --p
)
345 if ((*p
& 0xc0) != 0x80)
351 /* ========================================== */
352 /* Points to the next UTF-8 glyph in a string */
353 /* from the current position. */
354 /* ========================================== */
360 for (++p
; (*p
& 0xc0) == 0x80; ++p
)
364 return *p
== '\0' ? NULL
: p
;
367 /* ============================================================= */
368 /* Replaces any UTF-8 glyph present in s by a substitution */
369 /* character in-place. */
370 /* s will be modified but its address in memory will not change. */
371 /* ============================================================= */
373 utf8_sanitize(char *s
, char substitute
)
383 n
= utf8_get_length(*p
);
388 memmove(p
+ 1, p
+ n
, len
- (p
- s
) - n
+ 1);
396 /* ======================================================================= */
397 /* This function scans the '\0'-terminated string starting at s. */
398 /* It returns a pointer to the first byte of the first malformed */
399 /* or overlong UTF-8 sequence found, or NULL if the string contains only */
401 /* It also spots UTF-8 sequences that could cause trouble if converted to */
402 /* UTF-16, namely surrogate characters (U+D800..U+DFFF) and non-Unicode */
403 /* positions (U+FFFE..U+FFFF). */
404 /* This routine is very likely to find a malformed sequence if the input */
405 /* uses any other encoding than UTF-8. */
406 /* It therefore can be used as a very effective heuristic for */
407 /* distinguishing between UTF-8 and other encodings. */
409 /* I wrote this code mainly as a specification of functionality; there */
410 /* are no doubt performance optimizations possible for certain CPUs. */
412 /* Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> -- 2005-03-30 */
413 /* License: http://www.cl.cam.ac.uk/~mgk25/short-license.html */
414 /* ======================================================================= */
416 utf8_validate(char *s
)
418 unsigned char *us
= (unsigned char *)s
;
420 /* clang-format off */
426 else if ((us
[0] & 0xe0) == 0xc0)
428 /* 110XXXXx 10xxxxxx */
429 if ((us
[1] & 0xc0) != 0x80 || (us
[0] & 0xfe) == 0xc0) /* overlong? */
434 else if ((us
[0] & 0xf0) == 0xe0)
436 /* 1110XXXX 10Xxxxxx 10xxxxxx */
437 if ((us
[1] & 0xc0) != 0x80 ||
438 (us
[2] & 0xc0) != 0x80 ||
439 (us
[0] == 0xe0 && (us
[1] & 0xe0) == 0x80) || /* overlong? */
440 (us
[0] == 0xed && (us
[1] & 0xe0) == 0xa0) || /* surrogate? */
441 (us
[0] == 0xef && us
[1] == 0xbf &&
442 (us
[2] & 0xfe) == 0xbe)) /* U+FFFE or U+FFFF? */
447 else if ((us
[0] & 0xf8) == 0xf0)
449 /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */
450 if ((us
[1] & 0xc0) != 0x80 ||
451 (us
[2] & 0xc0) != 0x80 ||
452 (us
[3] & 0xc0) != 0x80 ||
453 (us
[0] == 0xf0 && (us
[1] & 0xf0) == 0x80) || /* overlong? */
454 (us
[0] == 0xf4 && us
[1] > 0x8f) || us
[0] > 0xf4) /* > U+10FFFF? */
462 /* clang-format on */
467 /* ======================= */
468 /* Multibyte UTF-8 strlen. */
469 /* ======================= */
471 utf8_strlen(char const *str
)
477 if ((str
[i
] & 0xc0) != 0x80)
484 /* ==================================================================== */
485 /* Multibyte extraction of the prefix of n UTF-8 glyphs from a string. */
486 /* The destination string d must have been allocated before. */
487 /* pos is updated to reflect the position AFTER the prefix. */
488 /* ==================================================================== */
490 utf8_strprefix(char *d
, char const *s
, long n
, long *pos
)
497 while (s
[i
] && j
< n
)
502 while (s
[i
] && (s
[i
] & 0xC0) == 0x80)
516 /* ================================================== */
517 /* Converts a UTF-8 glyph string to a wchar_t string. */
518 /* The returned string must be freed by the caller. */
519 /* ================================================== */
521 utf8_strtowcs(char *s
)
528 size
= (long)strlen(s
);
529 w
= xmalloc((size
+ 1) * sizeof(wchar_t));
533 for (ch
= (unsigned char *)s
; *ch
; ch
+= converted
)
535 if ((converted
= mbtowc(wptr
, (char *)ch
, 4)) > 0)
539 *wptr
++ = (wchar_t)*ch
;
549 /* ============================================================== */
550 /* Poor man's UTF-8-aware strtolower version. */
551 /* Replaces each ASCII character in src by its lowercase version. */
552 /* dst must be preallocated before the call. */
553 /* ============================================================== */
555 utf8_strtolower(char *dst
, char *src
)