Take UTF-8 spaces into account for empty strings
[smenu.git] / utf8.c
blob0f6e97df475f9c48344df9292566bd07208ea451
1 /* ################################################################### */
2 /* Copyright 2015, Pierre Gentile (p.gen.progs@gmail.com) */
3 /* */
4 /* This Source Code Form is subject to the terms of the Mozilla Public */
5 /* License, v. 2.0. If a copy of the MPL was not distributed with this */
6 /* file, You can obtain one at https://mozilla.org/MPL/2.0/. */
7 /* ################################################################### */
9 /* ************************************* */
10 /* Various UTF-8 manipulation functions. */
11 /* ************************************* */
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <stdarg.h>
16 #include <string.h>
17 #include <ctype.h>
18 #include <limits.h>
19 #include <langinfo.h>
20 #include "xmalloc.h"
21 #include "utf8.h"
/* ================================================================= */
/* Encodes the UCS-4 codepoint c as a UTF-8 byte sequence stored at  */
/* the beginning of utf8_str.                                        */
/*                                                                   */
/* utf8_str must be preallocated by the caller with room for at      */
/* least 5 bytes; no terminating NUL byte is written here.           */
/*                                                                   */
/* Returns the number of bytes written (1 to 4), or 0 when c is      */
/* greater than 0x10FFFF, in which case utf8_str is left untouched.  */
/* ================================================================= */
int
cptoutf8(char *utf8_str, uint32_t c)
{
  /* Beyond the last codepoint: nothing can be generated. */
  /* """""""""""""""""""""""""""""""""""""""""""""""""""" */
  if (c >= 0x110000)
    return 0;

  /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
  /* """"""""""""""""""""""""""""""""""" */
  if (c >= 0x10000)
  {
    utf8_str[0] = 0xF0 | ((c >> 18) & 0x07);
    utf8_str[1] = 0x80 | ((c >> 12) & 0x3F);
    utf8_str[2] = 0x80 | ((c >> 6) & 0x3F);
    utf8_str[3] = 0x80 | (c & 0x3F);
    return 4;
  }

  /* 1110xxxx 10xxxxxx 10xxxxxx */
  /* """""""""""""""""""""""""" */
  if (c >= 0x800)
  {
    utf8_str[0] = 0xE0 | ((c >> 12) & 0x0F);
    utf8_str[1] = 0x80 | ((c >> 6) & 0x3F);
    utf8_str[2] = 0x80 | (c & 0x3F);
    return 3;
  }

  /* 110xxxxx 10xxxxxx */
  /* """"""""""""""""" */
  if (c >= 0x80)
  {
    utf8_str[0] = 0xC0 | ((c >> 6) & 0x1F);
    utf8_str[1] = 0x80 | (c & 0x3F);
    return 2;
  }

  /* 0xxxxxxx (plain ASCII) */
  /* """""""""""""""""""""" */
  utf8_str[0] = c;
  return 1;
}
/* ======================================================================= */
/* Unicode (UTF-8) ASCII representation interpreter.                       */
/* The string passed will be altered but its address will not change.      */
/*                                                                         */
/* First pass: every \U followed by exactly 6 hexadecimal digits is        */
/* replaced by the UTF-8 sequence of the corresponding UCS-4 codepoint.    */
/* A \U followed by anything else, or by a value above 0x10FFFF, is        */
/* replaced (with the hexadecimal digits that were read, if any) by the    */
/* substitution character. This does not affect the return value.          */
/*                                                                         */
/* Second pass: every \u followed by the hexadecimal notation of the       */
/* bytes of one UTF-8 glyph (\uxx, \uxxxx, \uxxxxxx or \uxxxxxxxx) is      */
/* replaced by these bytes. When the digits are missing or do not form a   */
/* valid UTF-8 glyph, the sequence is replaced by the substitution         */
/* character.                                                              */
/*                                                                         */
/* Every move of the remaining text carries the terminating NUL byte       */
/* along, so s is a properly terminated string after each replacement      */
/* and no stale byte from a previous step can be scanned again.            */
/*                                                                         */
/* Returns 0 if s is NULL or if a \u sequence could not be converted,      */
/* else 1.                                                                 */
/* ======================================================================= */
int
utf8_interpret(char *s, char substitute)
{
  char  *utf8_str;        /* start of the current \U or \u sequence.       */
  size_t utf8_to_eos_len; /* bytes in s starting from utf8_str, without    *
                           | the terminating NUL byte.                     */
  size_t utf8_ascii_len;  /* number of hexadecimal digits describing the   *
                           | glyph: 2, 4, 6 or 8.                          */
  char   tmp[9];          /* temporary string.                             */
  int    rc = 1;          /* return code, 0: error, 1: fine.               */

  /* Guard against the case where s is NULL. */
  /* """"""""""""""""""""""""""""""""""""""" */
  if (s == NULL)
    return 0;

  /* Manage \U codepoints. */
  /* """"""""""""""""""""" */
  while ((utf8_str = strstr(s,
                            "\\"
                            "U"))
         != NULL)
  {
    int utf8_str_len = 0; /* number of hexadecimal digits read (0 to 6).   */
    int subst        = 1; /* 0, the \U sequence is valid, else 1.          */
    int n;

    utf8_to_eos_len = strlen(utf8_str);

    /* utf8_str_len is only updated by %n when at least one digit */
    /* has been read, it remains 0 otherwise.                     */
    /* '''''''''''''''''''''''''''''''''''''''''''''''''''''''''' */
    n = sscanf(utf8_str + 2,
               "%6["
               "0123456789"
               "abcdef"
               "ABCDEF"
               "]%n",
               tmp,
               &utf8_str_len);

    if (n == 1 && utf8_str_len == 6)
    {
      /* tmp contains exactly 6 hexadecimal digits at this point. */
      /* '''''''''''''''''''''''''''''''''''''''''''''''''''''''' */
      uint32_t cp = (uint32_t)strtoul(tmp, NULL, 16);

      if (cp <= 0x10FFFF)
      {
        char str[5];
        int  len;

        len = cptoutf8(str, cp);

        /* The 8 bytes of \Uxxxxxx are replaced by the len generated */
        /* bytes; the remaining text is moved with its NUL byte.     */
        /* ''''''''''''''''''''''''''''''''''''''''''''''''''''''''' */
        memmove(utf8_str, str, len);
        memmove(utf8_str + len, utf8_str + 8, utf8_to_eos_len - 8 + 1);

        subst = 0;
      }
    }

    /* In case of invalid \U sequence, replace it with the */
    /* substitution character.                             */
    /* ''''''''''''''''''''''''''''''''''''''''''''''''''' */
    if (subst)
    {
      *utf8_str = substitute;
      memmove(utf8_str + 1,
              utf8_str + 2 + utf8_str_len,
              utf8_to_eos_len - (size_t)utf8_str_len - 2 + 1);
    }
  }

  /* Manage \u UTF-8 byte sequences. */
  /* """"""""""""""""""""""""""""""" */
  while ((utf8_str = strstr(s,
                            "\\"
                            "u"))
         != NULL)
  {
    unsigned byte;
    char    *utf8_seq_offset; /* first hexadecimal digit after \u. */
    size_t   i;

    utf8_to_eos_len = strlen(utf8_str);

    /* String too short to contain a valid UTF-8 char: */
    /* truncate it after the substitution character.   */
    /* """"""""""""""""""""""""""""""""""""""""""""""" */
    if (utf8_to_eos_len < 4)
    {
      *utf8_str       = substitute;
      *(utf8_str + 1) = '\0';
      rc              = 0;
      continue;
    }

    utf8_seq_offset = utf8_str + 2;

    /* Get the first 2 hexadecimal digits (first UTF-8 byte). */
    /* """""""""""""""""""""""""""""""""""""""""""""""""""""" */
    *tmp       = *utf8_seq_offset;
    *(tmp + 1) = *(utf8_seq_offset + 1);
    *(tmp + 2) = '\0';

    /* If they are invalid, replace the \u sequence and these two */
    /* characters by the substitute character.                    */
    /* The casts are required as isxdigit is undefined for        */
    /* negative values other than EOF.                            */
    /* """""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
    if (!isxdigit((unsigned char)tmp[0]) || !isxdigit((unsigned char)tmp[1]))
    {
      *utf8_str = substitute;

      /* Do not forget the trailing \0. */
      /* """""""""""""""""""""""""""""" */
      memmove(utf8_str + 1, utf8_str + 4, utf8_to_eos_len - 4 + 1);

      rc = 0;
      continue;
    }

    /* They are valid, deduce from them the length of the sequence. */
    /* """""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
    sscanf(tmp, "%2x", &byte);

    utf8_ascii_len = utf8_get_length(byte) * 2;

    /* Put the bytes in the tmp string. */
    /* '''''''''''''''''''''''''''''''' */
    *tmp = byte; /* Reuse the tmp array. */

    for (i = 1; i < utf8_ascii_len / 2; i++)
    {
      char b[3] = { ' ', ' ', '\0' };
      char end  = '\0';
      int  n;

      n    = sscanf(utf8_seq_offset + 2 * i, "%c%c", &b[0], &b[1]);
      byte = 0;

      if (n == 2)
        sscanf(b, "%x%c", &byte, &end);

      /* A following byte is only accepted if it is made of two */
      /* hexadecimal digits forming a UTF-8 continuation byte.  */
      /* '''''''''''''''''''''''''''''''''''''''''''''''''''''' */
      if (n != 2 || byte == 0 || end != '\0' || (byte & 0xc0) != 0x80)
      {
        /* Force the new length according to the number of */
        /* valid UTF-8 bytes read.                         */
        /* ''''''''''''''''''''''''''''''''''''''''''''''' */
        utf8_ascii_len = 2 * i;
        break;
      }

      *(tmp + i) = byte;
    }

    tmp[utf8_ascii_len / 2] = '\0';

    /* Here the utf8_ascii_len digits accepted above are all present */
    /* in the string, hence 2 + utf8_ascii_len <= utf8_to_eos_len    */
    /* and the lengths given to memmove below cannot underflow.      */
    /* ''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' */

    /* Do they form a valid UTF-8 char? */
    /* '''''''''''''''''''''''''''''''' */
    if (utf8_validate(tmp) == NULL)
    {
      /* Put them back in the original string and move */
      /* the remaining bytes (NUL included) after them. */
      /* ''''''''''''''''''''''''''''''''''''''''''''' */
      memmove(utf8_str, tmp, utf8_ascii_len / 2);
      memmove(utf8_str + utf8_ascii_len / 2,
              utf8_seq_offset + utf8_ascii_len,
              utf8_to_eos_len - utf8_ascii_len - 2 + 1);
    }
    else
    {
      /* The invalid sequence is replaced by a */
      /* substitution character.               */
      /* ''''''''''''''''''''''''''''''''''''' */
      *utf8_str = substitute;
      memmove(utf8_str + 1,
              utf8_seq_offset + utf8_ascii_len,
              utf8_to_eos_len - utf8_ascii_len - 2 + 1);

      rc = 0;
    }
  }

  return rc;
}
/* ============================================================== */
/* Decodes the number of bytes taken by a UTF-8 glyph from its    */
/* first byte c.                                                  */
/*                                                                */
/* Returns:                                                       */
/*   1 when c is lower than 0x80 (ASCII),                         */
/*   2 when c is in the range 0x80..0xdf,                         */
/*   3 when c is in the range 0xe0..0xef,                         */
/*   4 otherwise.                                                 */
/*                                                                */
/* No validation is done: a continuation byte (0x80..0xbf) is     */
/* reported as starting a glyph of 2 bytes and any byte greater   */
/* or equal to 0xf0 as starting a glyph of 4 bytes.               */
/* ============================================================== */
int
utf8_get_length(unsigned char c)
{
  if (c < 0x80)
    return 1;
  else if (c < 0xe0)
    return 2;
  else if (c < 0xf0)
    return 3;

  return 4;
}
/* ================================================================ */
/* Returns the byte offset of the nth UTF-8 glyph in s, that is the */
/* number of bytes taken by the first n glyphs of s.                */
/*                                                                  */
/* A glyph is a first byte followed, when this byte has its high    */
/* bit set, by up to three continuation bytes (10xxxxxx).           */
/*                                                                  */
/* The scan never goes past the terminating NUL byte: when s        */
/* contains fewer than n glyphs, the length in bytes of s is        */
/* returned.                                                        */
/* ================================================================ */
size_t
utf8_offset(char const *s, size_t n)
{
  size_t i = 0;

  while (n > 0 && s[i] != '\0')
  {
    unsigned char first = (unsigned char)s[i++];

    if (first & 0x80)
    {
      int k;

      /* Skip the continuation bytes belonging to this glyph.    */
      /* The NUL byte is not a continuation byte, so this cannot */
      /* run past the end of the string.                         */
      /* """"""""""""""""""""""""""""""""""""""""""""""""""""""" */
      for (k = 0; k < 3 && ((unsigned char)s[i] & 0xc0) == 0x80; k++)
        i++;
    }

    n--;
  }

  return i;
}
/* ================================================================= */
/* Points to the previous UTF-8 glyph in a string from the given     */
/* position.                                                         */
/*                                                                   */
/* str is the beginning of the string and p a position in it.        */
/* When p points to a continuation byte (10xxxxxx), it is first      */
/* moved back to the first byte of its glyph.                        */
/*                                                                   */
/* Returns a pointer to the first byte of the glyph preceding that   */
/* one or NULL if there is none. str is never crossed, even when the */
/* string starts with continuation bytes.                            */
/* ================================================================= */
char *
utf8_prev(const char *str, const char *p)
{
  /* Go back to the first byte of the current glyph. */
  /* """"""""""""""""""""""""""""""""""""""""""""""" */
  while (p > str && (*p & 0xc0) == 0x80)
    p--;

  /* Then look backward for the first byte of the previous one. */
  /* """""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
  while (p > str)
  {
    p--;

    if ((*p & 0xc0) != 0x80)
      return (char *)p;
  }

  return NULL;
}
/* =============================================================== */
/* Points to the next UTF-8 glyph in a string from the current     */
/* position.                                                       */
/*                                                                 */
/* Returns a pointer to the first byte following p which is not a  */
/* continuation byte (10xxxxxx) or NULL when the end of the string */
/* is reached, including when p already points to it.              */
/* =============================================================== */
char *
utf8_next(char *p)
{
  if (*p != '\0')
  {
    /* Leave the current byte then skip the continuation bytes. */
    /* """""""""""""""""""""""""""""""""""""""""""""""""""""""" */
    do
      p++;
    while ((*p & 0xc0) == 0x80);
  }

  if (*p == '\0')
    return NULL;

  return p;
}
/* ================================================================= */
/* Replaces any multibyte UTF-8 glyph present in s by a substitution */
/* character in-place.                                               */
/* s will be modified but its address in memory will not change.     */
/*                                                                   */
/* The number of bytes of a glyph is only deduced from its first     */
/* byte by utf8_get_length, the following bytes are not checked.     */
/* This number is clamped to the number of bytes remaining in s so   */
/* that a truncated glyph at the end of the string cannot make the   */
/* move read or write after the terminating NUL byte.                */
/* ================================================================= */
void
utf8_sanitize(char *s, char substitute)
{
  char  *p   = s;
  size_t len = strlen(s);

  while (*p)
  {
    size_t n = (size_t)utf8_get_length((unsigned char)*p);

    if (n > 1)
    {
      /* Number of bytes from p to the end of s, NUL byte excluded. */
      /* At least 1 as *p is not the NUL byte.                      */
      /* """""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
      size_t remaining = len - (size_t)(p - s);

      if (n > remaining)
        n = remaining;

      /* The n bytes of the glyph become the substitution character, */
      /* the tail of the string is moved with its NUL byte.          */
      /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
      *p = substitute;
      memmove(p + 1, p + n, remaining - n + 1);
      len -= (n - 1);
    }

    p++;
  }
}
/* ======================================================================= */
/* This function scans the '\0'-terminated string starting at s.           */
/* It returns a pointer to the first byte of the first malformed           */
/* or overlong UTF-8 sequence found, or NULL if the string contains only   */
/* correct UTF-8.                                                          */
/* It also spots UTF-8 sequences that could cause trouble if converted to  */
/* UTF-16, namely surrogate characters (U+D800..U+DFFF) and non-Unicode    */
/* positions (U+FFFE..U+FFFF).                                             */
/* This routine is very likely to find a malformed sequence if the input   */
/* uses any other encoding than UTF-8.                                     */
/* It therefore can be used as a very effective heuristic for              */
/* distinguishing between UTF-8 and other encodings.                       */
/*                                                                         */
/* The bytes of a sequence are tested in order and the tests stop at the   */
/* first failure, so the terminating '\0' (which is not a continuation     */
/* byte) prevents any access after the end of the string.                  */
/*                                                                         */
/* I wrote this code mainly as a specification of functionality; there     */
/* are no doubt performance optimizations possible for certain CPUs.       */
/*                                                                         */
/* Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> -- 2005-03-30             */
/* License: http://www.cl.cam.ac.uk/~mgk25/short-license.html              */
/* ======================================================================= */
char *
utf8_validate(char *s)
{
  unsigned char *cur = (unsigned char *)s;

  for (;;)
  {
    unsigned char lead = cur[0];

    /* End of the string reached without any error. */
    /* """""""""""""""""""""""""""""""""""""""""""" */
    if (lead == 0)
      return NULL;

    /* 0xxxxxxx */
    /* """""""" */
    if (lead < 0x80)
    {
      cur++;
      continue;
    }

    /* 110XXXXx 10xxxxxx */
    /* """"""""""""""""" */
    if ((lead & 0xe0) == 0xc0)
    {
      if ((cur[1] & 0xc0) != 0x80)
        return (char *)cur;

      if ((lead & 0xfe) == 0xc0) /* overlong? */
        return (char *)cur;

      cur += 2;
      continue;
    }

    /* 1110XXXX 10Xxxxxx 10xxxxxx */
    /* """""""""""""""""""""""""" */
    if ((lead & 0xf0) == 0xe0)
    {
      if ((cur[1] & 0xc0) != 0x80 || (cur[2] & 0xc0) != 0x80)
        return (char *)cur;

      if (lead == 0xe0 && (cur[1] & 0xe0) == 0x80) /* overlong? */
        return (char *)cur;

      if (lead == 0xed && (cur[1] & 0xe0) == 0xa0) /* surrogate? */
        return (char *)cur;

      if (lead == 0xef && cur[1] == 0xbf
          && (cur[2] & 0xfe) == 0xbe) /* U+FFFE or U+FFFF? */
        return (char *)cur;

      cur += 3;
      continue;
    }

    /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */
    /* """"""""""""""""""""""""""""""""""" */
    if ((lead & 0xf8) == 0xf0)
    {
      if ((cur[1] & 0xc0) != 0x80 || (cur[2] & 0xc0) != 0x80
          || (cur[3] & 0xc0) != 0x80)
        return (char *)cur;

      if (lead == 0xf0 && (cur[1] & 0xf0) == 0x80) /* overlong? */
        return (char *)cur;

      if ((lead == 0xf4 && cur[1] > 0x8f) || lead > 0xf4) /* > U+10FFFF? */
        return (char *)cur;

      cur += 4;
      continue;
    }

    /* Lone continuation byte or invalid first byte. */
    /* """"""""""""""""""""""""""""""""""""""""""""" */
    return (char *)cur;
  }
}
/* ============================================================== */
/* Multibyte UTF-8 strlen.                                        */
/* Returns the number of bytes of str which are not continuation  */
/* bytes (10xxxxxx), that is one per UTF-8 glyph.                 */
/* ============================================================== */
size_t
utf8_strlen(char const *str)
{
  size_t      glyphs = 0;
  char const *p;

  for (p = str; *p != '\0'; p++)
  {
    if ((*p & 0xc0) == 0x80)
      continue; /* Continuation byte, already counted with its glyph. */

    glyphs++;
  }

  return glyphs;
}
/* ==================================================================== */
/* Multibyte extraction of the prefix of n UTF-8 glyphs from a string.  */
/* The first n glyphs of s (or all of them if s is shorter) are copied  */
/* in d followed by a terminating NUL byte.                             */
/* The destination string d must have been allocated before.            */
/* pos is updated to reflect the position AFTER the prefix, in bytes.   */
/* Returns d.                                                           */
/* ==================================================================== */
char *
utf8_strprefix(char *d, char const *s, long n, long *pos)
{
  long bytes = 0; /* number of bytes copied so far. */
  long glyphs;    /* number of glyphs copied so far. */

  *pos = 0;

  for (glyphs = 0; glyphs < n && s[bytes] != '\0'; glyphs++)
  {
    /* Copy the first byte of the glyph then all the continuation */
    /* bytes following it.                                        */
    /* """""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
    do
    {
      d[bytes] = s[bytes];
      bytes++;
    } while ((s[bytes] & 0xC0) == 0x80);
  }

  *pos = bytes;

  d[bytes] = '\0';

  return d;
}
/* =================================================================== */
/* Converts a UTF-8 glyph string to a wchar_t string.                  */
/* Each glyph is converted with mbtowc; when mbtowc cannot convert the */
/* bytes at the current position, the current byte is stored as is in  */
/* the result and the conversion resumes at the next byte.             */
/* The returned string is allocated with xmalloc and must be freed by  */
/* the caller.                                                         */
/* =================================================================== */
wchar_t *
utf8_strtowcs(char *s)
{
  unsigned char *in   = (unsigned char *)s;
  size_t         size = strlen(s);
  wchar_t       *w;
  wchar_t       *out;

  /* One wide character per byte is the worst case. */
  /* """""""""""""""""""""""""""""""""""""""""""""" */
  w    = xmalloc((size + 1) * sizeof(wchar_t));
  w[0] = L'\0';
  out  = w;

  while (*in != '\0')
  {
    int converted = mbtowc(out, (char *)in, 4);

    if (converted <= 0)
    {
      /* Conversion failure: keep the raw byte. */
      /* """""""""""""""""""""""""""""""""""""" */
      *out      = (wchar_t)*in;
      converted = 1;
    }

    out++;
    in += converted;
  }

  *out = L'\0';

  return w;
}
/* ================================================================= */
/* Poor man UTF-8 aware strtolower version.                          */
/* Copies src in dst, replacing each byte lower than 0x80 (ASCII) by */
/* the result of tolower; the other bytes, which belong to multibyte */
/* glyphs, are copied unchanged.                                     */
/* dst must be preallocated before the call with room for at least   */
/* as many bytes as src, terminating NUL byte included.              */
/* ================================================================= */
void
utf8_strtolower(char *dst, char *src)
{
  for (; *src != '\0'; src++, dst++)
  {
    unsigned char byte = (unsigned char)*src;

    *dst = (byte < 0x80) ? tolower(byte) : byte;
  }

  *dst = '\0';
}