diff --git a/usr/src/lib/libc/regex/engine.c b/usr/src/lib/libc/regex/engine.c --- a/usr/src/lib/libc/regex/engine.c +++ b/usr/src/lib/libc/regex/engine.c @@ -146,6 +146,8 @@ #define NOTE(s) /* nothing */ #endif + + /* * Given a multibyte string pointed to by start, step back nchar characters * from current position pointed to by cur. @@ -241,9 +243,8 @@ /* We depend on not being used for * for strings of length 1 */ - while (*--dp == *--pp && pp != mustfirst); - - if (*dp == *pp) + while (!XCOMP(g, *--dp, *--pp) && pp != mustfirst); + if (!XCOMP(g, *dp, *pp)) break; /* Jump to next possible match */ @@ -256,9 +257,9 @@ return(REG_NOMATCH); } else { for (dp = start; dp < stop; dp++) - if (*dp == g->must[0] && + if (!XCOMP(g, *dp, g->must[0]) && stop - dp >= g->mlen && - memcmp(dp, g->must, (size_t)g->mlen) == 0) + XXMEMCMP(g, dp, g->must, (size_t)g->mlen) == 0) break; if (dp == stop) /* we didn't find g->must */ return(REG_NOMATCH); @@ -452,6 +453,7 @@ case OEND: assert(nope); break; + case OICHAR: case OCHAR: sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); break; @@ -642,10 +644,11 @@ for (ss = startst; !hard && ss < stopst; ss++) switch (OP(s = m->g->strip[ss])) { case OCHAR: + case OICHAR: if (sp == stop) return(NULL); sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); - if (wc != OPND(s)) + if (XCOMP(m->g, wc, OPND(s))) return(NULL); break; case OANY: @@ -757,7 +760,7 @@ if (sp > stop - len) return(NULL); /* not enough left to match */ ssp = m->offp + m->pmatch[i].rm_so; - if (memcmp(sp, ssp, len) != 0) + if (XXMEMCMP(m->g, sp, ssp, len) != 0) return(NULL); while (m->g->strip[ss] != (sop)SOP(O_BACK, i)) ss++; @@ -1014,9 +1017,10 @@ assert(pc == stop-1); break; case OCHAR: + case OICHAR: /* only characters can match */ assert(!NONCHAR(ch) || ch != OPND(s)); - if (ch == OPND(s)) + if (!XCOMP(g, ch, OPND(s))) FWD(aft, bef, 1); break; case OBOS: @@ -1121,6 +1125,8 @@ return(aft); } + + #ifdef REDEBUG /* - print - print a set of states diff --git 
a/usr/src/lib/libc/regex/regcomp.c b/usr/src/lib/libc/regex/regcomp.c --- a/usr/src/lib/libc/regex/regcomp.c +++ b/usr/src/lib/libc/regex/regcomp.c @@ -1394,10 +1394,21 @@ { cset *cs; - if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch) - bothcases(p, ch); - else if ((ch & OPDMASK) == ch) - EMIT(OCHAR, ch); + int isb; + isb = ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch); + if ((ch & OPDMASK) == ch) { + if (!isb) + EMIT(OCHAR, ch); + else { + /* is icase */ + if ((ch & 0x7f) == ch) + EMIT(OICHAR, tolower(ch)); + else { + /* wide char case insensitive */ + bothcases(p, ch); + } + } + } else { /* * Kludge: character is too big to fit into an OCHAR operand. @@ -1877,6 +1888,7 @@ do { s = *scan++; switch (OP(s)) { + case OICHAR: case OCHAR: /* sequence member */ if (newlen == 0) { /* new sequence */ memset(&mbs, 0, sizeof(mbs)); @@ -2003,8 +2015,12 @@ scan = start; memset(&mbs, 0, sizeof(mbs)); while (cp < g->must + g->mlen) { - while (OP(s = *scan++) != OCHAR) - continue; + + s = *scan++; + while (OP(s) != OCHAR && OP(s) != OICHAR) { + s = *scan++; + } + clen = wcrtomb(cp, OPND(s), &mbs); assert(clen != (size_t)-1); cp += clen; @@ -2124,8 +2140,11 @@ * (notice that we match right to left, so that last character * is the first one that would be matched). */ - for (mindex = 0; mindex < g->mlen; mindex++) + for (mindex = 0; mindex < g->mlen; mindex++) { g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; + if (g->cflags & REG_ICASE) + g->charjump[othercase((int)g->must[mindex])] = g->mlen - mindex - 1; + } } /* @@ -2141,6 +2160,7 @@ * Notice that all values here are minus (g->mlen-1), because of the way * the search algorithm works. */ + static void computematchjumps(struct parse *p, struct re_guts *g) { @@ -2184,7 +2204,7 @@ * substring. 
*/ while (suffix < g->mlen - && g->must[mindex] != g->must[suffix]) { + && XCOMP(g, g->must[mindex], g->must[suffix])) { g->matchjump[suffix] = MIN(g->matchjump[suffix], g->mlen - mindex - 1); suffix = pmatches[suffix]; @@ -2246,3 +2266,27 @@ g->iflags |= BAD; return(maxnest); } + +/** + * Compare the first len bytes of two strings, ignoring case via tolower() + * + * @param s1 First byte string + * @param s2 Second byte string + * @param len Number of bytes compared in both strings + * @return 0 if equal ignoring case, else -1 or 1 from the raw bytes at the first mismatch + */ +int icasecmp(const char *s1, const char *s2, size_t len) { + + int i = 0; + + while (i < len) { + + if (tolower(s1[i]) != tolower(s2[i])) { + return (s1[i] < s2[i]) ? -1 : 1; + } + i++; + } + + // All len bytes matched when compared case-insensitively + return 0; +} \ No newline at end of file diff --git a/usr/src/lib/libc/regex/regex2.h b/usr/src/lib/libc/regex/regex2.h --- a/usr/src/lib/libc/regex/regex2.h +++ b/usr/src/lib/libc/regex/regex2.h @@ -107,7 +107,7 @@ #define OEOS (22L<cflags & REG_ICASE) ? (towlower(c1) != towlower(c2)) : \ + ((c1) != (c2))) +#define XXMEMCMP(g, s1, s2, len) \ + ((g->cflags & REG_ICASE) ? icasecmp(s1, s2, len) : \ + memcmp(s1, s2, len))