From 18efeb320b763e541a7dbf61a7da1cbe13ab2be9 Mon Sep 17 00:00:00 2001 From: Rich Felker Date: Mon, 16 Apr 2012 16:03:45 -0400 Subject: new scanf implementation and corresponding integer parser/converter advantages over the old code: - correct results for floating point (old code was bogus) - wide/regular scanf separated so scanf does not pull in wide code - well-defined behavior on integers that overflow dest type - support for %[a-b] ranges with %[ (impl-defined but widely used) - no intermediate conversion of fmt string to wide string - cleaner, easier to share code with strto* functions - better standards conformance for corner cases the old code remains in the source tree, as the wide versions of the scanf-family functions are still using it. it will be removed when no longer needed. --- src/internal/intscan.c | 97 +++++++++++++ src/internal/intscan.h | 8 ++ src/internal/stdio_impl.h | 2 + src/stdio/__string_read.c | 13 ++ src/stdio/vfscanf.c | 338 +++++++++++++++++++++++++++++++++++++++++++--- src/stdio/vsscanf.c | 22 ++- 6 files changed, 450 insertions(+), 30 deletions(-) create mode 100644 src/internal/intscan.c create mode 100644 src/internal/intscan.h create mode 100644 src/stdio/__string_read.c diff --git a/src/internal/intscan.c b/src/internal/intscan.c new file mode 100644 index 00000000..a00f2ccc --- /dev/null +++ b/src/internal/intscan.c @@ -0,0 +1,97 @@ +#include +#include +#include "shgetc.h" + +/* Lookup table for digit values. 
-1==255>=36 -> invalid */ +static const unsigned char table[] = { -1, +-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, +-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, +-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1, +-1,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24, +25,26,27,28,29,30,31,32,33,34,35,-1,-1,-1,-1,-1, +-1,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24, +25,26,27,28,29,30,31,32,33,34,35,-1,-1,-1,-1,-1, +-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, +-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, +-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, +-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, +-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, +-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, +-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, +-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, +}; +/* __intscan: read an optionally signed integer from f, one character at a time, using shgetc. A base above 36 fails with errno = EINVAL and returns 0. When base is 0 or 16 and the first digit is '0', a following 'x' or 'X' selects base 16; without it, base 0 becomes base 8. If "0x" is not followed by a hex digit, that character is pushed back, then either one more character is pushed back (pok nonzero) or shlim(f, 0) is called, and 0 is returned. In the other branch base 0 becomes 10, and a first character that is not a digit in the chosen base yields shlim(f, 0), errno = EINVAL and a return of 0. The visible tail sets errno = ERANGE and returns lim-1 or lim for out-of-range values, and otherwise returns (y^neg)-neg, which is y negated when a '-' was read (neg is 0 or -1). NOTE(review): the text between "y = y" and "<=lim)" appears truncated in this copy; the remaining digit loops and the "done" label targeted by the goto are not visible. Restore it from the upstream commit before relying on this hunk. */ +unsigned long long __intscan(FILE *f, unsigned base, int pok, unsigned long long lim) +{ + const unsigned char *val = table+1; /* table has one extra leading -1 entry, so val[-1] is a valid "invalid digit" slot; presumably this lets c == EOF index val directly (confirm against shgetc) */ + int c, neg=0; + unsigned x; + unsigned long long y; + if (base > 36) { + errno = EINVAL; + return 0; + } + c = shgetc(f); + if (c=='+' || c=='-') { + neg = -(c=='-'); + c = shgetc(f); + } + if ((base == 0 || base == 16) && c=='0') { + c = shgetc(f); + if ((c|32)=='x') { + c = shgetc(f); + if (val[c]>=16) { + shunget(f); + if (pok) shunget(f); + else shlim(f, 0); + return 0; + } + base = 16; + } else if (base == 0) { + base = 8; + } + } else { + if (base == 0) base = 10; + if (val[c] >= base) { + shlim(f, 0); + errno = EINVAL; + return 0; + } + } + if (base == 10) { + for (x=0; c-'0'<10U && x<=UINT_MAX/10-1; c=shgetc(f)) + x = x*10 + (c-'0'); + for (y=x; c-'0'<10U && y<=ULLONG_MAX/10 && 10*y<=ULLONG_MAX-(c-'0'); c=shgetc(f)) + y = y*10 + (c-'0'); + if (c-'0'>=10U) goto done; + } else if (!(base & base-1)) { + int bs = "\0\1\2\4\7\3\6\5"[(0x17*base)>>5&7]; + for (x=0; val[c]>bs; c=shgetc(f)) + y = y<=lim) { + if (!(lim&1) && !neg) { + errno = ERANGE; + return lim-1; + } else if 
(y>lim) { + errno = ERANGE; + return lim; + } + } + return (y^neg)-neg; +} diff --git a/src/internal/intscan.h b/src/internal/intscan.h new file mode 100644 index 00000000..994c5e7d --- /dev/null +++ b/src/internal/intscan.h @@ -0,0 +1,8 @@ +#ifndef INTSCAN_H +#define INTSCAN_H + +#include + +unsigned long long __intscan(FILE *, unsigned, int, unsigned long long); + +#endif diff --git a/src/internal/stdio_impl.h b/src/internal/stdio_impl.h index 5ec296f3..af7aacc8 100644 --- a/src/internal/stdio_impl.h +++ b/src/internal/stdio_impl.h @@ -69,6 +69,8 @@ size_t __stdout_write(FILE *, const unsigned char *, size_t); off_t __stdio_seek(FILE *, off_t, int); int __stdio_close(FILE *); +size_t __string_read(FILE *, unsigned char *, size_t); + int __toread(FILE *); int __towrite(FILE *); diff --git a/src/stdio/__string_read.c b/src/stdio/__string_read.c new file mode 100644 index 00000000..5c3728d7 --- /dev/null +++ b/src/stdio/__string_read.c @@ -0,0 +1,13 @@ +#include "stdio_impl.h" +/* __string_read: read callback for a FILE whose cookie points into a NUL-terminated string. It measures at most len+256 bytes ahead with strnlen (k), copies min(len, k) bytes into buf, exposes the bytes from src+len up to src+k through f->rpos and f->rend, advances f->cookie by k, and returns the number of bytes copied into buf. */ +size_t __string_read(FILE *f, unsigned char *buf, size_t len) +{ + char *src = f->cookie; + size_t k = strnlen(src, len+256); + if (k < len) len = k; + memcpy(buf, src, len); + f->rpos = (void *)(src+len); + f->rend = (void *)(src+k); + f->cookie = src+k; + return len; +} diff --git a/src/stdio/vfscanf.c b/src/stdio/vfscanf.c index 414c2a3d..5c1e49b1 100644 --- a/src/stdio/vfscanf.c +++ b/src/stdio/vfscanf.c @@ -1,36 +1,342 @@ #include +#include +#include +#include +#include +#include +#include #include #include -#include +#include +#include #include "stdio_impl.h" -#include "__scanf.h" +#include "shgetc.h" +#include "intscan.h" +#include "floatscan.h" -static void f_read(rctx_t *r) +#define SIZE_hh -2 +#define SIZE_h -1 +#define SIZE_def 0 +#define SIZE_l 1 +#define SIZE_L 2 +#define SIZE_ll 3 +/* store_int: write i through dest using the integer type selected by size (one of the SIZE_* codes above); a null dest is ignored. */ +static void store_int(void *dest, int size, unsigned long long i) { - FILE *f = r->opaque; - if ((r->c = getc_unlocked(f)) >= 0) r->l++; + if (!dest) return; + switch (size) { + case SIZE_hh: + 
*(char *)dest = i; + break; + case SIZE_h: + *(short *)dest = i; + break; + case SIZE_def: + *(int *)dest = i; + break; + case SIZE_l: + *(long *)dest = i; + break; + case SIZE_ll: + *(long long *)dest = i; + break; + } } -int vfscanf(FILE *f, const char *fmt, va_list ap) +static void *arg_n(va_list ap, unsigned int n) { - size_t l = strlen(fmt), i, result; - rctx_t r = { f_read, (void *)f, 0, isspace }; - wchar_t fmt2[l+1]; + void *p; + unsigned int i; + va_list ap2; + va_copy(ap2, ap); + for (i=n; i>1; i--) va_arg(ap2, void *); + p = va_arg(ap2, void *); + va_end(ap2); + return p; +} - if (l > 0x100000) { - errno = ENOMEM; +static int readwc(int c, wchar_t **wcs, mbstate_t *st) +{ + char ch = c; + wchar_t wc; + switch (mbrtowc(&wc, &ch, 1, st)) { + case -1: return -1; + case -2: + break; + default: + if (*wcs) *(*wcs)++ = wc; } - for (i=0; i<=l; i++) fmt2[i] = (unsigned char)fmt[i]; + return 0; +} + +int vfscanf(FILE *f, const char *fmt, va_list ap) +{ + int width; + int size; + int alloc; + int base; + const unsigned char *p; + int c, t; + char *s; + wchar_t *wcs; + mbstate_t st; + void *dest=NULL; + int invert; + int matches=0; + unsigned long long x; + long double y; + off_t pos = 0; FLOCK(f); - result = __scanf(&r, fmt2, ap); + for (p=(const unsigned char *)fmt; *p; p++) { + + if (isspace(*p)) { + while (isspace(p[1])) p++; + shlim(f, 0); + while (isspace(shgetc(f))); + shunget(f); + pos += shcnt(f); + continue; + } + if (*p != '%' || p[1] == '%') { + p += *p=='%'; + c = shgetc(f); + if (c!=*p) { + shunget(f); + if (c<0) goto input_fail; + goto match_fail; + } + pos++; + continue; + } + + p++; + if (*p=='*') { + dest = 0; p++; + } else if (isdigit(*p) && p[1]=='$') { + dest = arg_n(ap, *p-'0'); p+=2; + } else { + dest = va_arg(ap, void *); + } + + for (width=0; isdigit(*p); p++) { + width = 10*width + *p - '0'; + } - if (r.u && r.c >= 0) - ungetc(r.c, f); + if (*p=='m') { + alloc = 1; + p++; + } else { + alloc = 0; + } + size = SIZE_def; + switch (*p++) { + 
case 'h': + if (*p == 'h') p++, size = SIZE_hh; + else size = SIZE_h; + break; + case 'l': + if (*p == 'l') p++, size = SIZE_ll; + else size = SIZE_l; + break; + case 'j': + size = SIZE_ll; + break; + case 'z': + case 't': + size = SIZE_l; + break; + case 'L': + size = SIZE_L; + break; + case 'd': case 'i': case 'o': case 'u': case 'x': + case 'a': case 'e': case 'f': case 'g': + case 'A': case 'E': case 'F': case 'G': case 'X': + case 's': case 'c': case '[': + case 'S': case 'C': + case 'p': case 'n': + p--; + break; + default: + goto fmt_fail; + } + + t = *p; + + switch (t) { + case 'C': + case 'c': + if (width < 1) width = 1; + case 's': + if (size == SIZE_l) t &= ~0x20; + case 'd': case 'i': case 'o': case 'u': case 'x': + case 'a': case 'e': case 'f': case 'g': + case 'A': case 'E': case 'F': case 'G': case 'X': + case '[': case 'S': + case 'p': case 'n': + if (width < 1) width = 0; + break; + default: + goto fmt_fail; + } + + shlim(f, width); + + if (t != 'n') { + if (shgetc(f) < 0) goto input_fail; + shunget(f); + } + + switch (t) { + case 'n': + store_int(dest, size, pos); + /* do not increment match count, etc! 
*/ + continue; + case 'C': + wcs = dest; + st = (mbstate_t){ 0 }; + while ((c=shgetc(f)) >= 0) { + if (readwc(c, &wcs, &st) < 0) + goto input_fail; + } + if (!mbsinit(&st)) goto input_fail; + if (shcnt(f) != width) goto match_fail; + break; + case 'c': + if (dest) { + s = dest; + while ((c=shgetc(f)) >= 0) *s++ = c; + } else { + while (shgetc(f)>=0); + } + if (shcnt(f) < width) goto match_fail; + break; + case '[': + s = dest; + wcs = dest; + + if (*++p == '^') p++, invert = 1; + else invert = 0; + + unsigned char scanset[257]; + memset(scanset, invert, sizeof scanset); + + scanset[0] = 0; + if (*p == '-') p++, scanset[1+'-'] = 1-invert; + if (*p == ']') p++, scanset[1+']'] = 1-invert; + for (; *p && *p != ']'; p++) { + if (*p=='-' && p[1] != ']') + for (c=p++[-1]; c<*p; c++) + scanset[1+c] = 1-invert; + scanset[1+*p] = 1-invert; + } + if (!*p) goto fmt_fail; + + if (size == SIZE_l) { + st = (mbstate_t){0}; + while (scanset[(c=shgetc(f))+1]) { + if (readwc(c, &wcs, &st) < 0) + goto input_fail; + } + if (!mbsinit(&st)) goto input_fail; + s = 0; + } else if (s) { + while (scanset[(c=shgetc(f))+1]) + *s++ = c; + wcs = 0; + } else { + while (scanset[(c=shgetc(f))+1]); + } + shunget(f); + if (!shcnt(f)) goto match_fail; + if (s) *s = 0; + if (wcs) *wcs = 0; + break; + default: + shlim(f, 0); + while (isspace(shgetc(f))); + shunget(f); + pos += shcnt(f); + shlim(f, width); + if (shgetc(f) < 0) goto input_fail; + shunget(f); + } + + switch (t) { + case 'p': + case 'X': + case 'x': + base = 16; + goto int_common; + case 'o': + base = 8; + goto int_common; + case 'd': + case 'u': + base = 10; + goto int_common; + case 'i': + base = 0; + int_common: + x = __intscan(f, base, 0, ULLONG_MAX); + if (!shcnt(f)) goto match_fail; + if (t=='p') *(void **)dest = (void *)(uintptr_t)x; + else store_int(dest, size, x); + break; + case 'a': case 'A': + case 'e': case 'E': + case 'f': case 'F': + case 'g': case 'G': + y = __floatscan(f, -1, size, 0); + if (!shcnt(f)) goto match_fail; + if 
(dest) switch (size) { + case SIZE_def: + *(float *)dest = y; + break; + case SIZE_l: + *(double *)dest = y; + break; + case SIZE_L: + *(long double *)dest = y; + break; + } + break; + case 'S': + wcs = dest; + st = (mbstate_t){ 0 }; + while (!isspace(c=shgetc(f)) && c!=EOF) { + if (readwc(c, &wcs, &st) < 0) + goto input_fail; + } + if (!mbsinit(&st)) goto input_fail; + if (dest) *wcs++ = 0; + break; + case 's': + if (dest) { + s = dest; + while (!isspace(c=shgetc(f)) && c!=EOF) + *s++ = c; + *s = 0; + } else { + while (!isspace(c=shgetc(f)) && c!=EOF); + } + shunget(f); + break; + } + + pos += shcnt(f); + if (dest) matches++; + } + if (0) { +fmt_fail: +input_fail: + if (!matches) matches--; + } +match_fail: FUNLOCK(f); - return result; + return matches; } diff --git a/src/stdio/vsscanf.c b/src/stdio/vsscanf.c index fd48f709..fbc15e69 100644 --- a/src/stdio/vsscanf.c +++ b/src/stdio/vsscanf.c @@ -1,21 +1,15 @@ -#include -#include -#include +#include "stdio_impl.h" -#include "__scanf.h" - -static void s_read(rctx_t *r) +static size_t do_read(FILE *f, unsigned char *buf, size_t len) { - unsigned char *s = r->opaque; - if (!s[r->l]) r->c = -1; - else r->c = s[r->l++]; + return __string_read(f, buf, len); } int vsscanf(const char *s, const char *fmt, va_list ap) { - size_t l = strlen(fmt), i; - wchar_t fmt2[l+1]; - rctx_t r = { s_read, (void *)s, 0, isspace }; - for (i=0; i<=l; i++) fmt2[i] = (unsigned char)fmt[i]; - return __scanf(&r, fmt2, ap); + FILE f = { + .buf = (void *)s, .cookie = (void *)s, + .read = do_read, .lock = -1 + }; + return vfscanf(&f, fmt, ap); } -- cgit v1.2.1