Wednesday, November 7, 2007

Parsing Integers

Every once in a while, it becomes necessary to parse an integer from a string. The trick is doing so correctly and without the need to use an integer type larger than what you are parsing into (i.e. if you are trying to parse a 32bit integer, you don't want to have to use a temporary 64bit int value for checking overflow - this is especially desirable if you are trying to parse larger int values, such as a 64bit int because you won't likely have access to a 128bit int type).

The following macro correctly implements an integer parsing routine for a given number of bits:

#define STRTOINT(bits, max)                                     \
static int                                                      \
strtoint##bits (const char *in, char **inend,                   \
                int##bits##_t *retval, int *err)                \
{                                                               \
    register const char *inptr = in;                            \
    int##bits##_t val = 0;                                      \
    int digit, sign = 1;                                        \
                                                                \
    while (*inptr == ' ')                                       \
        inptr++;                                                \
                                                                \
    if (*inptr == '-') {                                        \
        sign = -1;                                              \
        inptr++;                                                \
    }                                                           \
                                                                \
    in = inptr;                                                 \
    while (*inptr >= '0' && *inptr <= '9')                      \
        digit = (*inptr - '0');                                 \
        if (val > (max / 10)) {                                 \
            *err = EOVERFLOW;                                   \
            return -1;                                          \
        } else if (val == (max / 10)) {                         \
            if (digit > (max % 10) &&                           \
                (sign > 0 || digit > ((max % 10) + 1))) {       \
                *err = EOVERFLOW;                               \
                return -1;                                      \
            }                                                   \
                                                                \
            if (sign < 0)                                       \
                val = (val * sign * 10) - digit;                \
            else                                                \
                val = (val * 10) + digit;                       \
                                                                \
            inptr++;                                            \
            if (*inptr >= '0' && *inptr <= '9') {               \
                *err = EOVERFLOW;                               \
                return -1;                                      \
            }                                                   \
                                                                \
            goto ret;                                           \
        } else {                                                \
            val = (val * 10) + digit;                           \
        }                                                       \
                                                                \
        inptr++;                                                \
    }                                                           \
                                                                \
    if (inptr == in) {                                          \
        *err = EINVAL;                                          \
        return -1;                                              \
    }                                                           \
                                                                \
    val *= sign;                                                \
                                                                \
 ret:                                                           \
    *inend = (char *) inptr;                                    \
    *retval = val;                                              \
                                                                \
    return 0;                                                   \
}

To expand this macro to parse a 32bit integer value from a string, you would do:

STRTOINT(32, 2147483647L)

The resulting function would be named strtoint32().

Why implement my own integer parsing routine instead of using libc's strtol(), you ask?

For a variety of reasons:

  • strtol() might not be available or you might want to implement an integer parsing routine in a language other than c/c++ (a variation of my routine is now used in Mono for all of the Int.Parse() methods).
  • strtol() is locale dependent which is not always a desirable trait.
  • strtol() requires annoying error checking that using my function makes trivial.
  • it's easy to extend my function macro to support integers of any bit size, e.g. 64bit integers (or someday 128bit integers).

To generate functions to parse 8, 16, 32, and 64bit integers, you would write:

STRTOINT(8, 127)
STRTOINT(16, 32767)
STRTOINT(32, 2147483647L)
STRTOINT(64, 9223372036854775807LL)

What about unsigned integers, you ask? Not to worry, I have an implementation for that as well:

#define STRTOUINT(bits, max)                                    \
static int                                                      \
strtouint##bits (const char *in, char **inend,                  \
                 uint##bits##_t *retval, int *err)              \
{                                                               \
    register const char *inptr = in;                            \
    uint##bits##_t val = 0;                                     \
    int digit;                                                  \
                                                                \
    while (*inptr == ' ')                                       \
        inptr++;                                                \
                                                                \
    in = inptr;                                                 \
    while (*inptr >= '0' && *inptr <= '9') {                    \
        digit = (*inptr - '0');                                 \
        if (val > (max / 10)) {                                 \
            *err = EOVERFLOW;                                   \
            return -1;                                          \
        } else if (val == (max / 10) && digit > (max % 10)) {   \
            *err = EOVERFLOW;                                   \
            return -1;                                          \
        } else {                                                \
            val = (val * 10) + digit;                           \
        }                                                       \
                                                                \
        inptr++;                                                \
    }                                                           \
                                                                \
    if (inptr == in) {                                          \
        *err = EINVAL;                                          \
        return -1;                                              \
    }                                                           \
                                                                \
    *inend = (char *) inptr;                                    \
    *retval = val;                                              \
                                                                \
    return 0;                                                   \
}

STRTOUINT(8, 255)
STRTOUINT(16, 65535)
STRTOUINT(32, 4294967295UL)
STRTOUINT(64, 18446744073709551615ULL)

And there you have it.

Note: these routines require [u]int[8,16,32,64]_t to be defined. If you are on Linux, #include <stdint.h> should do the trick.

Update: Miguel's blog goes into more detail on why this code was important for Mono.

Post a Comment

Code Snippet Licensing

All code posted to this blog is licensed under the MIT/X11 license unless otherwise stated in the post itself.