nestlex.c

   1 /* source: nestlex.c */
   2 /* Copyright Gerhard Rieger and contributors (see file CHANGES) */
   3 /* Published under the GNU General Public License V.2, see file COPYING */
   4
   5 /* a function for lexical scanning of nested character patterns */
   6
   7 #include "config.h"
   8 #include "mytypes.h"
   9
  10 #include "sysincludes.h"
  11
  12 static int _nestlex(const char **addr,
  13                     char **token,
  14                     ptrdiff_t *len,
  15                     const char *ends[],
  16                     const char *hquotes[],
  17                     const char *squotes[],
  18                     const char *nests[],
  19                     bool dropquotes,
  20                     bool c_esc,
  21                     bool html_esc
  22                     );
  23
  24 /* sub: scan a string and copy its value to output string
  25    end scanning when an unescaped, unnested string from ends array is found
  26    does not copy the end pattern
  27    does not write a trailing \0 to token
  28    allows escaping with \ and quoting (\ and quotes are removed)
  29    allows nesting with div. parens
  30    returns -1 if out string was too small
  31    returns 1 if addr ended unexpectedly
  32    returns 0 if token could be extracted successfully
  33 */
  34 int nestlex(const char **addr,  /* input string; aft points to end token */
  35             char **token,       /* output token; aft points to first unwritten
  36                                    char (caller might want to set it to \0) */
  37             size_t *len,        /* remaining bytes in token space (incl. \0) */
  38             const char *ends[], /* list of end strings */
  39             const char *hquotes[],/* list of strings that quote (hard qu.) */
  40             const char *squotes[],/* list of strings that quote softly */
  41             const char *nests[],/* list of strings that start nesting;
  42                                    every second one is matching end */
  43             bool dropquotes,    /* drop the outermost quotes */
  44             bool c_esc,         /* solve C char escapes: \n \t \0 etc */
  45             bool html_esc       /* solve HTML char escapes: %0d %08 etc */
  46             ) {
  47    return
  48       _nestlex(addr, token, (ptrdiff_t *)len, ends, hquotes, squotes, nests,
  49                dropquotes, c_esc, html_esc);
  50 }
  51
  52 static int _nestlex(const char **addr,
  53                     char **token,
  54                     ptrdiff_t *len,
  55                     const char *ends[],
  56                     const char *hquotes[],
  57                     const char *squotes[],
  58                     const char *nests[],
  59                     bool dropquotes,
  60                     bool c_esc,
  61                     bool html_esc
  62                     ) {
  63    const char *in = *addr;      /* pointer into input string */
  64    const char **endx;   /* loops over end patterns */
  65    const char **quotx;  /* loops over quote patterns */
  66    const char **nestx;  /* loops over nest patterns */
  67    char *out = *token;  /* pointer into output token */
  68    char c;
  69    int i;
  70    int result;
  71
  72    while (true) {
  73
  74       /* is this end of input string? */
  75       if (*in == 0)  {
  76
  77          break; /* end of string */
  78       }
  79
  80       /* first check the end patterns (e.g. for ']') */
  81       endx = ends;  i = 0;
  82       while (*endx) {
  83          if (!strncmp(in, *endx, strlen(*endx))) {
  84             /* this end pattern matches */
  85             *addr = in;
  86             *token = out;
  87             return 0;
  88          }
  89          ++endx;
  90       }
  91
  92       /* check for hard quoting pattern */
  93       quotx = hquotes;
  94       while (hquotes && *quotx) {
  95          if (!strncmp(in, *quotx, strlen(*quotx))) {
  96             /* this quote pattern matches */
  97             const char *endnest[2];
  98             if (dropquotes) {
  99                /* we strip this quote */
 100                in += strlen(*quotx);
 101             } else {
 102                for (i = strlen(*quotx); i > 0; --i) {
 103                   *out++ = *in++;
 104                   if (--*len <= 0) { *addr = in; *token = out; return -1; }
 105                }
 106             }
 107             /* we call _nestlex recursively */
 108             endnest[0] = *quotx;
 109             endnest[1] = NULL;
 110             result =
 111                _nestlex(&in, &out, len, endnest, NULL/*hquotes*/,
 112                        NULL/*squotes*/, NULL/*nests*/,
 113                        false, c_esc, html_esc);
 114             if (result == 0 && dropquotes) {
 115                /* we strip this quote */
 116                in += strlen(*quotx);
 117             } else if (result < 0) {
 118                *addr = in; *token = out; return result;
 119             } else {
 120                /* we copy the trailing quote */
 121                for (i = strlen(*quotx); i > 0; --i) {
 122                   *out++ = *in++;
 123                   if (--*len <= 0) { *addr = in; *token = out; return -1; }
 124                }
 125             }
 126
 127             break;
 128          }
 129          ++quotx;
 130       }
 131       if (hquotes && *quotx != NULL) {
 132          /* there was a quote; string might continue with hard quote */
 133          continue;
 134       }
 135
 136       /* check for soft quoting pattern */
 137       quotx = squotes;
 138       while (squotes && *quotx) {
 139          if (!strncmp(in, *quotx, strlen(*quotx))) {
 140             /* this quote pattern matches */
 141             /* we strip this quote */
 142             /* we call _nestlex recursively */
 143             const char *endnest[2];
 144             if (dropquotes) {
 145                /* we strip this quote */
 146                in += strlen(*quotx);
 147             } else {
 148                for (i = strlen(*quotx); i > 0; --i) {
 149                   *out++ = *in++;
 150                   if (--*len <= 0) { *addr = in; *token = out; return -1; }
 151                }
 152             }
 153             endnest[0] = *quotx;
 154             endnest[1] = NULL;
 155             result =
 156                _nestlex(&in, &out, len, endnest, hquotes,
 157                        squotes, nests,
 158                        false, c_esc, html_esc);
 159
 160             if (result == 0 && dropquotes) {
 161                /* we strip the trailing quote */
 162                if (!in[0] || strncmp(in, *quotx, strlen(*quotx)))  return 1;
 163                in += strlen(*quotx);
 164             } else if (result < 0) {
 165                *addr = in; *token = out; return result;
 166             } else {
 167                /* we copy the trailing quote */
 168                for (i = strlen(*quotx); i > 0; --i) {
 169                   *out++ = *in++;
 170                   if (--*len <= 0) { *addr = in; *token = out; return -1; }
 171                }
 172             }
 173             break;
 174          }
 175          ++quotx;
 176       }
 177       if (squotes && *quotx != NULL) {
 178          /* there was a soft quote; string might continue with any quote */
 179          continue;
 180       }
 181
 182       /* check patterns that start a nested clause */
 183       nestx = nests;  i = 0;
 184       while (nests && *nestx) {
 185          if (!strncmp(in, *nestx, strlen(*nestx))) {
 186             /* this nest pattern matches */
 187             const char *endnest[2];
 188             endnest[0] = nestx[1];
 189             endnest[1] = NULL;
 190
 191             for (i = strlen(nestx[1]); i > 0; --i) {
 192                *out++ = *in++;
 193                if (--*len <= 0)  { *addr = in; *token = out; return -1; }
 194             }
 195
 196             result =
 197                _nestlex(&in, &out, len, endnest, hquotes, squotes, nests,
 198                        false, c_esc, html_esc);
 199             if (result == 0) {
 200                /* copy endnest */
 201                i = strlen(nestx[1]); while (i > 0) {
 202                   *out++ = *in++;
 203                   if (--*len <= 0) {
 204                      *addr = in;
 205                      *token = out;
 206                      return -1;
 207                   }
 208                   --i;
 209                }
 210             } else if (result < 0) {
 211                *addr = in; *token = out; return result;
 212             }
 213             break;
 214          }
 215          nestx += 2;    /* skip matching end pattern in table */
 216       }
 217       if (nests && *nestx) {
 218          /* we handled a nested expression, continue loop */
 219          continue;
 220       }
 221
 222       /* "normal" data, possibly escaped */
 223       c = *in++;
 224       if (c == '\\') {
 225          /* found a plain \ escaped part */
 226          c = *in++;
 227          if (c == 0)  { /* Warn("trailing '\\'");*/ break; }
 228          if (c_esc) { /* solve C char escapes: \n \t \0 etc */
 229             switch (c) {
 230             case '0': c = '\0'; break;
 231             case 'a': c = '\a'; break;
 232             case 'b': c = '\b'; break;
 233             case 'f': c = '\f'; break;
 234             case 'n': c = '\n'; break;
 235             case 'r': c = '\r'; break;
 236             case 't': c = '\t'; break;
 237             case 'v': c = '\v'; break;
 238 #if LATER
 239             case 'x': !!! 1 to 2 hex digits; break;
 240             case 'u': !!! 4 hex digits?; break;
 241             case 'U': !!! 8 hex digits?; break;
 242 #endif
 243             default: break;
 244             }
 245          }
 246          *out++ = c;
 247          --*len;
 248          if (*len <= 0) {
 249             *addr = in;
 250             *token = out;
 251             return -1;  /* output overflow */
 252          }
 253          continue;
 254       }
 255
 256       /* just a simple char */
 257       *out++ = c;
 258       --*len;
 259       if (*len <= 0) {
 260          *addr = in;
 261          *token = out;
 262          return -1;     /* output overflow */
 263       }
 264
 265    }
 266    /* never come here? */
 267
 268    *addr = in;
 269    *token = out;
 270    return 0;    /* OK */
 271 }