utf8.c

   1 /* ################################################################### */
   2 /* Copyright 2015, Pierre Gentile (p.gen.progs@gmail.com)              */
   3 /*                                                                     */
   4 /* This Source Code Form is subject to the terms of the Mozilla Public */
   5 /* License, v. 2.0. If a copy of the MPL was not distributed with this */
   6 /* file, You can obtain one at https://mozilla.org/MPL/2.0/.           */
   7 /* ################################################################### */
   8
   9 /* ************************************* */
  10 /* Various UTF-8 manipulation functions. */
  11 /* ************************************* */
  12
  13 #include <stdio.h>
  14 #include <stdlib.h>
  15 #include <stdarg.h>
  16 #include <string.h>
  17 #include <ctype.h>
  18 #include <limits.h>
  19 #include <langinfo.h>
  20 #include "xmalloc.h"
  21 #include "utf8.h"
  22
  23 /* =========================================================== */
  24 /* UTF-8 byte sequence generation from a given UCS-4 codepoint */
  25 /* utf8_str must be preallocated with a size of at least 5     */
  26 /* bytes.                                                      */
  27 /* return the length of the generated sequence or 0 if c is    */
  28 /* not a valid codepoint.                                      */
  29 /* =========================================================== */
  30 int
  31 cptoutf8(char *utf8_str, uint32_t c)
  32 {
  33   int len = 0;
  34
  35   if (c < 0x80)
  36   {
  37     utf8_str[0] = c;
  38     len         = 1;
  39   }
  40   else if (c < 0x800)
  41   {
  42     utf8_str[0] = 0xC0 | ((c >> 6) & 0x1F);
  43     utf8_str[1] = 0x80 | (c & 0x3F);
  44     len         = 2;
  45   }
  46   else if (c < 0x10000)
  47   {
  48     utf8_str[0] = 0xE0 | ((c >> 12) & 0x0F);
  49     utf8_str[1] = 0x80 | ((c >> 6) & 0x3F);
  50     utf8_str[2] = 0x80 | (c & 0x3F);
  51     len         = 3;
  52   }
  53   else if (c < 0x110000)
  54   {
  55     utf8_str[0] = 0xF0 | ((c >> 18) & 0x07);
  56     utf8_str[1] = 0x80 | ((c >> 12) & 0x3F);
  57     utf8_str[2] = 0x80 | ((c >> 6) & 0x3F);
  58     utf8_str[3] = 0x80 | (c & 0x3F);
  59     len         = 4;
  60   }
  61
  62   return len;
  63 }
  64
  65 /* ======================================================================= */
  66 /* Unicode (UTF-8) ASCII representation interpreter.                       */
  67 /* The string passed will be altered but its address will not change.      */
  68 /* All hexadecimal sequences of \uxx, \uxxxx, \uxxxxxx and \uxxxxxxxx will */
  69 /* be replaced by the corresponding UTF-8 character when possible.         */
  70 /* All hexadecimal sequences of \Uxxxxxx will be replaced with the UTF-8   */
  71 /* sequence corresponding to the given UCS-4 codepoint.                    */
  72 /* When not possible the substitution character is substituted in place.   */
  73 /* Returns 0 if the conversion has failed else 1.                          */
  74 /* ======================================================================= */
  75 int
  76 utf8_interpret(char *s, char substitute)
  77 {
  78   char  *utf8_str;          /* \uxx...                                     */
  79   size_t utf8_to_eos_len;   /* bytes in s starting from the first          *
  80                              * occurrence of \u.                           */
  81   size_t init_len;          /* initial lengths of the string to interpret  */
  82   size_t utf8_ascii_len;    /* 2,4,6 or 8 bytes.                           */
  83   size_t len_to_remove = 0; /* number of bytes to remove after the         *
  84                              | conversion.                                 */
  85   char   tmp[9];            /* temporary string.                           */
  86   int    rc = 1;            /* return code, 0: error, 1: fine.             */
  87
  88   /* Guard against the case where s is NULL. */
  89   /* """"""""""""""""""""""""""""""""""""""" */
  90   if (s == NULL)
  91     return 0;
  92
  93   init_len = strlen(s);
  94
  95   /* Manage \U codepoints. */
  96   /* """"""""""""""""""""" */
  97   while ((utf8_str = strstr(s,
  98                             "\\"
  99                             "U"))
 100          != NULL)
 101   {
 102     int      utf8_str_len;
 103     int      n;
 104     uint32_t cp;
 105     int      subst; /* 0, the \U sequence is valid, else 1. */
 106
 107     utf8_to_eos_len = strlen(utf8_str);
 108     utf8_str_len    = 0;
 109
 110     n = sscanf(utf8_str + 2,
 111                "%6["
 112                "0123456789"
 113                "abcdef"
 114                "ABCDEF"
 115                "]%n",
 116                tmp,
 117                &utf8_str_len);
 118
 119     subst = 0;
 120
 121     if (n == 1 && utf8_str_len == 6)
 122     {
 123       sscanf(tmp, "%x", &cp);
 124       if (cp > 0x10FFFF)
 125         subst = 1; /* Invalid range. */
 126       else
 127       {
 128         char str[7];
 129         int  len;
 130
 131         len             = cptoutf8(str, cp);
 132         str[len]        = '\0';
 133         *(utf8_str + 1) = 'u';
 134         memmove(utf8_str, str, len);
 135         memmove(utf8_str + len, utf8_str + 8, utf8_to_eos_len - 8);
 136         len_to_remove += 8 - len;
 137       }
 138     }
 139     else
 140       subst = 1; /* Invalid sequence. */
 141
 142     /* In case of invalid \U sequence, replace it with the */
 143     /* substitution character.                             */
 144     /* ''''''''''''''''''''''''''''''''''''''''''''''''''' */
 145     if (subst)
 146     {
 147       *utf8_str = substitute;
 148       memmove(utf8_str + 1,
 149               utf8_str + 2 + utf8_str_len,
 150               utf8_to_eos_len - (utf8_str_len + 2 - 1));
 151       len_to_remove += utf8_str_len + 2 - 1;
 152     }
 153   }
 154
 155   /* Make sure that the string is well terminated. */
 156   /* """"""""""""""""""""""""""""""""""""""""""""" */
 157   *(s + init_len - len_to_remove) = '\0';
 158
 159   /* Manage \u UTF-8 byte sequences. */
 160   /* """"""""""""""""""""""""""""""" */
 161   while ((utf8_str = strstr(s,
 162                             "\\"
 163                             "u"))
 164          != NULL)
 165   {
 166     utf8_to_eos_len = strlen(utf8_str);
 167     if (utf8_to_eos_len < 4) /* string too short to contain *
 168                               | a valid UTF-8 char.         */
 169     {
 170       *utf8_str       = substitute;
 171       *(utf8_str + 1) = '\0';
 172       rc              = 0;
 173     }
 174     else /* s is long enough. */
 175     {
 176       unsigned byte;
 177       char    *utf8_seq_offset = utf8_str + 2;
 178
 179       /* Get the first 2 UTF-8 bytes. */
 180       /* """""""""""""""""""""""""""" */
 181       *tmp       = *utf8_seq_offset;
 182       *(tmp + 1) = *(utf8_seq_offset + 1);
 183       *(tmp + 2) = '\0';
 184
 185       /* If they are invalid, replace the \u sequence by the */
 186       /* substitute character.                               */
 187       /* """"""""""""""""""""""""""""""""""""""""""""""""""" */
 188       if (!isxdigit(tmp[0]) || !isxdigit(tmp[1]))
 189       {
 190         *utf8_str = substitute;
 191         if (4 >= utf8_to_eos_len)
 192           *(utf8_str + 1) = '\0';
 193         else
 194         {
 195           /* Do not forget the training \0. */
 196           /* """""""""""""""""""""""""""""" */
 197           memmove(utf8_str + 1, utf8_str + 4, utf8_to_eos_len - 4 + 1);
 198         }
 199         rc = 0;
 200       }
 201       else
 202       {
 203         char   end;
 204         size_t i;
 205         char   b[3] = { ' ', ' ', '\0' };
 206
 207         /* They are valid, deduce from them the length of the sequence. */
 208         /* """""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
 209         sscanf(tmp, "%2x", &byte);
 210
 211         utf8_ascii_len = utf8_get_length(byte) * 2;
 212
 213         /* replace the \u sequence by the bytes forming the UTF-8 char. */
 214         /* """""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
 215
 216         /* Put the bytes in the tmp string. */
 217         /* '''''''''''''''''''''''''''''''' */
 218         *tmp = byte; /* Reuse the tmp array. */
 219
 220         for (i = 1; i < utf8_ascii_len / 2; i++)
 221         {
 222           int good = 1;
 223           int n;
 224
 225           n = sscanf(utf8_seq_offset + 2 * i, "%c%c", &b[0], &b[1]);
 226
 227           if (n == 2)
 228           {
 229             byte = 0;
 230             end  = '\0';
 231             sscanf(b, "%x%c", &byte, &end);
 232
 233             if (byte == 0 || end != '\0' || (byte & 0xc0) != 0x80)
 234               good = 0;
 235           }
 236           else
 237             good = 0;
 238
 239           if (good)
 240             *(tmp + i) = byte;
 241           else
 242             utf8_ascii_len = 2 * i; /* Force the new length according to the *
 243                                      | number of valid UTF-8 bytes read.     */
 244         }
 245         tmp[utf8_ascii_len / 2] = '\0';
 246
 247         /* Does they form a valid UTF-8 char? */
 248         /* '''''''''''''''''''''''''''''''''' */
 249         if (utf8_validate(tmp) == NULL)
 250         {
 251           /* Put them back in the original string and move */
 252           /* the remaining bytes after them.               */
 253           /* ''''''''''''''''''''''''''''''''''''''''''''' */
 254           memmove(utf8_str, tmp, utf8_ascii_len / 2);
 255
 256           if (utf8_to_eos_len < utf8_ascii_len)
 257             *(utf8_str + utf8_ascii_len / 2 + 1) = '\0';
 258           else
 259             memmove(utf8_str + utf8_ascii_len / 2,
 260                     utf8_seq_offset + utf8_ascii_len,
 261                     utf8_to_eos_len - utf8_ascii_len - 2 + 1);
 262         }
 263         else
 264         {
 265           /* The invalid sequence is replaced by a */
 266           /* substitution character.               */
 267           /* ''''''''''''''''''''''''''''''''''''' */
 268           *utf8_str = substitute;
 269
 270           if (utf8_to_eos_len < utf8_ascii_len)
 271             *(utf8_str + 1) = '\0';
 272           else
 273             memmove(utf8_str + 1,
 274                     utf8_seq_offset + utf8_ascii_len,
 275                     utf8_to_eos_len - utf8_ascii_len - 2 + 1);
 276
 277           utf8_ascii_len = 2;
 278           rc             = 0;
 279         }
 280
 281         /* Update the number of bytes to remove at the end */
 282         /* of the initial string.                          */
 283         /* """"""""""""""""""""""""""""""""""""""""""""""" */
 284         len_to_remove += 2 + utf8_ascii_len / 2;
 285       }
 286     }
 287   }
 288
 289   /* Make sure that the string is well terminated. */
 290   /* """"""""""""""""""""""""""""""""""""""""""""" */
 291   *(s + init_len - len_to_remove) = '\0';
 292
 293   return rc;
 294 }
 295
 296 /* ========================================================= */
 297 /* Decodes the number of bytes taken by a UTF-8 glyph.       */
 298 /* It is the length of the leading sequence of bits set to 1 */
 299 /* in the first byte.                                        */
 300 /* ========================================================= */
 301 int
 302 utf8_get_length(unsigned char c)
 303 {
 304   if (c < 0x80)
 305     return 1;
 306   else if (c < 0xe0)
 307     return 2;
 308   else if (c < 0xf0)
 309     return 3;
 310
 311   return 4;
 312 }
 313
 314 /* ==================================================== */
 315 /* Returns the byte offset of the nth UTF-8 glyph in s. */
 316 /* ==================================================== */
 317 size_t
 318 utf8_offset(char const *s, size_t n)
 319 {
 320   size_t i = 0;
 321
 322   while (n > 0)
 323   {
 324     if (s[i++] & 0x80)
 325     {
 326       (void)(((s[++i] & 0xc0) != 0x80) || ((s[++i] & 0xc0) != 0x80) || ++i);
 327     }
 328     n--;
 329   }
 330   return i;
 331 }
 332
 333 /* ============================================== */
 334 /* Points to the previous UTF-8 glyph in a string */
 335 /* from the given position.                       */
 336 /* ============================================== */
 337 char *
 338 utf8_prev(const char *str, const char *p)
 339 {
 340   while ((*p & 0xc0) == 0x80)
 341     p--;
 342
 343   for (--p; p >= str; --p)
 344   {
 345     if ((*p & 0xc0) != 0x80)
 346       return (char *)p;
 347   }
 348   return NULL;
 349 }
 350
 351 /* ========================================== */
 352 /* Points to the next UTF-8 glyph in a string */
 353 /* from the current position.                 */
 354 /* ========================================== */
 355 char *
 356 utf8_next(char *p)
 357 {
 358   if (*p)
 359   {
 360     for (++p; (*p & 0xc0) == 0x80; ++p)
 361       ;
 362   }
 363
 364   return *p == '\0' ? NULL : p;
 365 }
 366
 367 /* ============================================================= */
 368 /* Replaces any UTF-8 glyph present in s by a substitution       */
 369 /* character in-place.                                           */
 370 /* s will be modified but its address in memory will not change. */
 371 /* ============================================================= */
 372 void
 373 utf8_sanitize(char *s, char substitute)
 374 {
 375   char  *p = s;
 376   size_t len;
 377
 378   len = strlen(s);
 379   while (*p)
 380   {
 381     int n;
 382
 383     n = utf8_get_length(*p);
 384
 385     if (n > 1)
 386     {
 387       *p = substitute;
 388       memmove(p + 1, p + n, len - (p - s) - n + 1);
 389       len -= (n - 1);
 390     }
 391
 392     p++;
 393   }
 394 }
 395
 396 /* ======================================================================= */
 397 /* This function scans the '\0'-terminated string starting at s.           */
 398 /* It returns a pointer to the first byte of the first malformed           */
 399 /* or overlong UTF-8 sequence found, or NULL if the string contains only   */
 400 /* correct UTF-8.                                                          */
 401 /* It also spots UTF-8 sequences that could cause trouble if converted to  */
 402 /* UTF-16, namely surrogate characters (U+D800..U+DFFF) and non-Unicode    */
 403 /* positions (U+FFFE..U+FFFF).                                             */
 404 /* This routine is very likely to find a malformed sequence if the input   */
 405 /* uses any other encoding than UTF-8.                                     */
 406 /* It therefore can be used as a very effective heuristic for              */
 407 /* distinguishing between UTF-8 and other encodings.                       */
 408 /*                                                                         */
 409 /* I wrote this code mainly as a specification of functionality; there     */
 410 /* are no doubt performance optimizations possible for certain CPUs.       */
 411 /*                                                                         */
 412 /* Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> -- 2005-03-30             */
 413 /* License: http://www.cl.cam.ac.uk/~mgk25/short-license.html              */
 414 /* ======================================================================= */
 415 char *
 416 utf8_validate(char *s)
 417 {
 418   unsigned char *us = (unsigned char *)s;
 419
 420   /* clang-format off */
 421   while (*us)
 422   {
 423     if (*us < 0x80)
 424       /* 0xxxxxxx */
 425       us++;
 426     else if ((us[0] & 0xe0) == 0xc0)
 427     {
 428       /* 110XXXXx 10xxxxxx */
 429       if ((us[1] & 0xc0) != 0x80 || (us[0] & 0xfe) == 0xc0) /* overlong? */
 430         return (char *)us;
 431
 432       us += 2;
 433     }
 434     else if ((us[0] & 0xf0) == 0xe0)
 435     {
 436       /* 1110XXXX 10Xxxxxx 10xxxxxx */
 437       if ((us[1] & 0xc0) != 0x80 ||
 438           (us[2] & 0xc0) != 0x80 ||
 439           (us[0] == 0xe0 && (us[1] & 0xe0) == 0x80) || /* overlong?         */
 440           (us[0] == 0xed && (us[1] & 0xe0) == 0xa0) || /* surrogate?        */
 441           (us[0] == 0xef && us[1] == 0xbf &&
 442             (us[2] & 0xfe) == 0xbe))                   /* U+FFFE or U+FFFF? */
 443         return (char *)us;
 444
 445       us += 3;
 446     }
 447     else if ((us[0] & 0xf8) == 0xf0)
 448     {
 449       /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */
 450       if ((us[1] & 0xc0) != 0x80 ||
 451           (us[2] & 0xc0) != 0x80 ||
 452           (us[3] & 0xc0) != 0x80 ||
 453           (us[0] == 0xf0 && (us[1] & 0xf0) == 0x80) ||     /* overlong?   */
 454           (us[0] == 0xf4 && us[1] > 0x8f) || us[0] > 0xf4) /* > U+10FFFF? */
 455         return (char *)us;
 456
 457       us += 4;
 458     }
 459     else
 460       return (char *)us;
 461   }
 462   /* clang-format on */
 463
 464   return NULL;
 465 }
 466
 467 /* ======================= */
 468 /* Multibyte UTF-8 strlen. */
 469 /* ======================= */
 470 size_t
 471 utf8_strlen(char const *str)
 472 {
 473   size_t i = 0, j = 0;
 474
 475   while (str[i])
 476   {
 477     if ((str[i] & 0xc0) != 0x80)
 478       j++;
 479     i++;
 480   }
 481   return j;
 482 }
 483
 484 /* ==================================================================== */
 485 /* Multibytes extraction of the prefix of n UTF-8 glyphs from a string. */
 486 /* The destination string d must have been allocated before.            */
 487 /* pos is updated to reflect the position AFTER the prefix.             */
 488 /* ==================================================================== */
 489 char *
 490 utf8_strprefix(char *d, char const *s, long n, long *pos)
 491 {
 492   long i = 0;
 493   long j = 0;
 494
 495   *pos = 0;
 496
 497   while (s[i] && j < n)
 498   {
 499     d[i] = s[i];
 500     i++;
 501     j++;
 502     while (s[i] && (s[i] & 0xC0) == 0x80)
 503     {
 504       d[i] = s[i];
 505       i++;
 506     }
 507   }
 508
 509   *pos = i;
 510
 511   d[i] = '\0';
 512
 513   return d;
 514 }
 515
 516 /* ================================================== */
 517 /* Converts a UTF-8 glyph string to a wchar_t string. */
 518 /* The returned string must be freed by the caller.   */
 519 /* ================================================== */
 520 wchar_t *
 521 utf8_strtowcs(char *s)
 522 {
 523   int            converted = 0;
 524   unsigned char *ch;
 525   wchar_t       *wptr, *w;
 526   size_t         size;
 527
 528   size = (long)strlen(s);
 529   w    = xmalloc((size + 1) * sizeof(wchar_t));
 530   w[0] = L'\0';
 531
 532   wptr = w;
 533   for (ch = (unsigned char *)s; *ch; ch += converted)
 534   {
 535     if ((converted = mbtowc(wptr, (char *)ch, 4)) > 0)
 536       wptr++;
 537     else
 538     {
 539       *wptr++   = (wchar_t)*ch;
 540       converted = 1;
 541     }
 542   }
 543
 544   *wptr = L'\0';
 545
 546   return w;
 547 }
 548
 549 /* ============================================================== */
 550 /* Poor man UTF-8 aware strtolower version.                       */
 551 /* Replaces all ASCII characters in src by its lowercase version. */
 552 /* dst must be preallocated before the call.                      */
 553 /* ============================================================== */
 554 void
 555 utf8_strtolower(char *dst, char *src)
 556 {
 557   unsigned char c;
 558
 559   while ((c = *src))
 560   {
 561     if (c >= 0x80)
 562       *dst = c;
 563     else
 564       *dst = tolower(c);
 565
 566     src++;
 567     dst++;
 568   }
 569
 570   *dst = '\0';
 571 }