diff -u -r icu/source/common/unicode/utypes.h icu.new/source/common/unicode/utypes.h --- icu/source/common/unicode/utypes.h 2005-06-28 23:18:08.000000000 +0100 +++ icu.new/source/common/unicode/utypes.h 2006-08-09 17:09:09.000000000 +0100 @@ -706,6 +706,11 @@ U_REGEX_INVALID_FLAG, /**< Invalid value for match mode flags. */ U_REGEX_LOOK_BEHIND_LIMIT, /**< Look-Behind pattern matches must have a bounded maximum length. */ U_REGEX_SET_CONTAINS_STRING, /**< Regexps cannot have UnicodeSets containing strings.*/ + U_REGEX_UNKNOWN_GROUP_NAME, /**< Unrecognized group name. */ + U_REGEX_UNTERMINATED_GROUP_NAME, /**< Unterminated group name. */ + U_REGEX_DUPLICATE_GROUP_NAME, /**< Group names in a regexp must be unique. */ + U_REGEX_BAD_GROUP_NAME, /**< Bad group name in regexp. */ + U_REGEX_TOO_MANY_BRANCHES, /**< Conditional expression contains too many branches */ U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */ /* diff -u -r icu/source/common/utypes.c icu.new/source/common/utypes.c --- icu/source/common/utypes.c 2005-07-12 21:32:00.000000000 +0100 +++ icu.new/source/common/utypes.c 2006-08-09 17:09:01.000000000 +0100 @@ -154,7 +154,12 @@ "U_REGEX_INVALID_BACK_REF", "U_REGEX_INVALID_FLAG", "U_REGEX_LOOK_BEHIND_LIMIT", - "U_REGEX_SET_CONTAINS_STRING" + "U_REGEX_SET_CONTAINS_STRING", + "U_REGEX_UNKNOWN_GROUP_NAME", + "U_REGEX_UNTERMINATED_GROUP_NAME", + "U_REGEX_DUPLICATE_GROUP_NAME", + "U_REGEX_BAD_GROUP_NAME", + "U_REGEX_TOO_MANY_BRANCHES" }; /* TODO: replace the definition with _uIDNAErrorName[U_IDNA_ERROR_LIMIT - U_IDNA_ERROR_START ] * in ICU 3.6 diff -u -r icu/source/i18n/regexcmp.cpp icu.new/source/i18n/regexcmp.cpp --- icu/source/i18n/regexcmp.cpp 2004-12-30 07:25:50.000000000 +0000 +++ icu.new/source/i18n/regexcmp.cpp 2006-08-10 14:51:46.000000000 +0100 @@ -1,4 +1,3 @@ - // // file: regexcmp.cpp // @@ -24,6 +23,7 @@ #include "util.h" #include "cmemory.h" #include "cstring.h" +#include "uvector.h" #include 
"uvectr32.h" #include "uassert.h" #include "ucln_in.h" @@ -36,7 +36,6 @@ #include "regexst.h" - U_NAMESPACE_BEGIN @@ -60,7 +59,7 @@ fCharNum = 0; fQuoteMode = FALSE; fInBackslashQuote = FALSE; - fModeFlags = fRXPat->fFlags; + fModeFlags = fRXPat->fFlags & URX_USER_MODE_FLAGS; fEOLComments = TRUE; fMatchOpenParen = -1; @@ -239,15 +238,41 @@ // The pattern has now been read and processed, and the compiled code generated. // - // Back-reference fixup + // Back-reference and condition fixup // int32_t loc; for (loc=0; loc<fRXPat->fCompiledPat->size(); loc++) { int32_t op = fRXPat->fCompiledPat->elementAti(loc); int32_t opType = URX_TYPE(op); - if (opType == URX_BACKREF || opType == URX_BACKREF_I) { + + // Mutate named backrefs into numbered ones + if (opType == URX_NAMED_BACKREF || opType == URX_NMD_BACKREF_I || opType == URX_NAMED_CONDTN) { + int32_t nameIndex = URX_VAL(op); + int32_t where = fRXPat->fNamedGroupMap->elementAti(nameIndex); + + if (where < 0) { + error (U_REGEX_UNKNOWN_GROUP_NAME); + break; + } + + if (opType == URX_NAMED_BACKREF) { + opType = URX_BACKREF; + } else if (opType == URX_NMD_BACKREF_I) { + opType = URX_BACKREF_I; + } else if (opType == URX_NAMED_CONDTN) { + opType = URX_CONDITION; + } + op = URX_BUILD(opType, where + 1); + } + + if (opType == URX_BACKREF || opType == URX_BACKREF_I || opType == URX_CONDITION) { int32_t where = URX_VAL(op); if (where > fRXPat->fGroupMap->size()) { + // Perl allows conditional expressions with nonexistent numeric conditions :-( + if (opType == URX_CONDITION) { + fRXPat->fCompiledPat->setElementAt(URX_BUILD(URX_NOP, 0), loc); + continue; // not an error: keep fixing up the remaining ops + } error(U_REGEX_INVALID_BACK_REF); break; } @@ -380,8 +405,27 @@ // save from the location on the top of the parentheses stack. 
int32_t savePosition = fParenStack.popi(); int32_t op = fRXPat->fCompiledPat->elementAti(savePosition); - U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location - op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); + UBool isConditional = FALSE; + + // Check for condition expressions; '|' needs slightly different handling + // inside these. + if (fModeFlags & URX_INSIDE_CONDITIONAL) { + int32_t opType = URX_TYPE(op); + + if (fModeFlags & URX_GOT_COND_ELSE) { + error (U_REGEX_TOO_MANY_BRANCHES); + } + + fModeFlags |= URX_GOT_COND_ELSE; + + U_ASSERT(opType == URX_JMP || opType == URX_STATE_SAVE); + + op = URX_BUILD(opType, fRXPat->fCompiledPat->size() + 1); + } else { + U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location + op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size() + 1); + } + fRXPat->fCompiledPat->setElementAt(op, savePosition); // Append an JMP operation into the compiled pattern. The operand for @@ -394,15 +438,60 @@ // This registers if for fixup when this block's close paren is encountered. fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); - // Append a NOP to the compiled pattern. This is the slot reserved - // for a SAVE in the event that there is yet another '|' following - // this one. - fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); - fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); + if (!(fModeFlags & URX_INSIDE_CONDITIONAL)) { + // Append a NOP to the compiled pattern. This is the slot reserved + // for a SAVE in the event that there is yet another '|' following + // this one. + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); + fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); + } } break; + case doOpenPyNamedParen: + // Python-style named paren. 
+ // Scan the name and add it to the named groups list, then proceed + // as normal + { + UnicodeString *name = scanName(); + + if (!name) { + break; + } + + if (name->length() == 0) { + error(U_REGEX_BAD_GROUP_NAME); + } + + int32_t nameCount = fRXPat->fNamedGroups->size(); + int32_t nameIndex; + for (nameIndex = 0; nameIndex < nameCount; ++nameIndex) { + UnicodeString *other = (UnicodeString *)fRXPat->fNamedGroups->elementAt(nameIndex); + + if (*other == *name) { + int32_t curGroup = fRXPat->fNamedGroupMap->elementAti(nameIndex); + if (curGroup < 0) { + fRXPat->fNamedGroupMap->setElementAt(fRXPat->fGroupMap->size(), + nameIndex); // resolve the slot of the forward-referenced name + break; + } else { + error(U_REGEX_DUPLICATE_GROUP_NAME); + break; + } + } + } + + if (nameIndex >= nameCount) { + fRXPat->fNamedGroups->addElement(name, *fStatus); + fRXPat->fNamedGroupMap->addElement(fRXPat->fGroupMap->size(), *fStatus); + } else { + delete name; + } + } + + // No break; fall through here + case doOpenCaptureParen: // Open Paren. // Compile to a @@ -438,6 +527,9 @@ fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc + // No longer inside a conditional (if we were) + fModeFlags &= ~URX_INSIDE_CONDITIONAL; + // Save the mapping from group number to stack frame variable position. fRXPat->fGroupMap->addElement(varsLoc, *fStatus); } @@ -460,8 +552,11 @@ fParenStack.push(plain, *fStatus); // Begin a new frame. fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc + + // No longer inside a conditional (if we were) + fModeFlags &= ~URX_INSIDE_CONDITIONAL; } - break; + break; case doOpenAtomicParen: @@ -488,9 +583,61 @@ fParenStack.push(atomic, *fStatus); // Frame type. 
fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP + + // No longer inside a conditional (if we were) + fModeFlags &= ~URX_INSIDE_CONDITIONAL; } break; + case doCondOpenLookAhead: + // Conditional expression using lookahead. + // + // Compiles to + // + // NOP reserved for quantifiers + // START_LA start lookahead + // SAVE_STATE points to the false-part + // NOP reserved for quantifiers on internal block (unused) + // NOP reserved for STATE_SAVE if we see an '|' in the condition + // pattern code the code implementing the pattern + // END_LA finish lookahead + // true-part the bit that gets run if the lookahead succeeds + // JMP to the very end + // false-part the bit that gets run if the lookahead fails + // + // This results in *two* entries on the paren stack. + { + int32_t nop = URX_BUILD(URX_NOP, 0); + + fRXPat->fCompiledPat->addElement(nop, *fStatus); + + int32_t dataLoc = fRXPat->fDataSize; + fRXPat->fDataSize += 2; + int32_t op = URX_BUILD(URX_LA_START, dataLoc); + fRXPat->fCompiledPat->addElement(op, *fStatus); + + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 0), *fStatus); + + fRXPat->fCompiledPat->addElement(nop, *fStatus); + fRXPat->fCompiledPat->addElement(nop, *fStatus); + + // On the paren stack, start a frame for the conditional expression + fParenStack.push(fModeFlags, *fStatus); + fParenStack.push(condWithMatch, *fStatus); + fParenStack.push(fRXPat->fCompiledPat->size() - 5, *fStatus); // The NOP location + fParenStack.push(fRXPat->fCompiledPat->size() - 3, *fStatus); // The SAVE_STATE location + + // Now push a frame for the lookahead expression itself + fParenStack.push((fModeFlags & ~URX_GOT_COND_ELSE) + | URX_INSIDE_CONDITIONAL, *fStatus); + fParenStack.push(lookAheadCE, *fStatus); // This is a lookahead test + fParenStack.push(fRXPat->fCompiledPat->size() - 2, *fStatus); // The first NOP + 
fParenStack.push(fRXPat->fCompiledPat->size() - 1, *fStatus); // The second NOP + + // The lookahead part isn't in the then-else part of the expression + fModeFlags &= ~URX_INSIDE_CONDITIONAL; + } + break; case doOpenLookAhead: // Positive Look-ahead (?= stuff ) @@ -520,6 +667,58 @@ fParenStack.push(lookAhead, *fStatus); // Frame type. fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location + + // No longer inside a conditional (if we were) + fModeFlags &= ~URX_INSIDE_CONDITIONAL; + } + break; + + case doCondOpenLookAheadNeg: + // Conditional expression using negated lookahead. + // + // Compiles to + // + // NOP reserved for quantifiers + // START_LA start lookahead + // SAVE_STATE points to the true-part + // NOP reserved for STATE_SAVE if we see an '|' in the cond. + // pattern code the code implementing the pattern + // END_LA finish lookahead + // JMP to false-part + // true-part the bit that gets run if the lookahead succeeds + // JMP to the very end + // false-part the bit that gets run if the lookahead fails + // + // This results in *two* entries on the paren stack. 
+ { + int32_t nop = URX_BUILD(URX_NOP, 0); + + fRXPat->fCompiledPat->addElement(nop, *fStatus); + + int32_t dataLoc = fRXPat->fDataSize; + fRXPat->fDataSize += 2; + int32_t op = URX_BUILD(URX_LA_START, dataLoc); + fRXPat->fCompiledPat->addElement(op, *fStatus); + + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 0), *fStatus); + + fRXPat->fCompiledPat->addElement(nop, *fStatus); + + // On the paren stack, start a frame for the conditional expression + fParenStack.push(fModeFlags, *fStatus); + fParenStack.push(condWithMatch, *fStatus); + fParenStack.push(fRXPat->fCompiledPat->size() - 4, *fStatus); // The NOP location + fParenStack.push(0, *fStatus); // Dummy, will point to JMP to false-part later + + // Now push a frame for the lookahead expression itself + fParenStack.push((fModeFlags & ~URX_GOT_COND_ELSE) + | URX_INSIDE_CONDITIONAL, *fStatus); + fParenStack.push(negLookAheadCE, *fStatus); // This is a !lookahead test + fParenStack.push(fRXPat->fCompiledPat->size() - 2, *fStatus); // The SAVE_STATE location + fParenStack.push(fRXPat->fCompiledPat->size() - 1, *fStatus); // The second NOP + + // The lookahead part isn't in the then-else part of the expression + fModeFlags &= ~URX_INSIDE_CONDITIONAL; } break; @@ -557,6 +756,69 @@ } break; + case doCondOpenLookBehind: + // Conditional expression using lookbehind. 
+ // + // Compiles to + // + // NOP reserved for quantifiers + // LB_START start lookbehind + // STATE_SAVE points to the false-part + // LB_CONT continue from here + // - holds min match length + // - holds max match length + // NOP reserved for quantifiers on the internal block + // NOP reserved for STATE_SAVE if we see an '|' in the condition + // pattern code code for the lookbehind expression + // LB_END check match length, restore input len + // LA_END restore stack and input pos (also cancels STATE_SAVE) + // true-part the bit that gets run if the lookbehind succeeds + // JMP to the very end + // false-part the bit that gets run if the lookbehind fails + // + // This results in *two* entries on the paren stack. + { + int32_t nop = URX_BUILD(URX_NOP, 0); + + // Emit the instructions + fRXPat->fCompiledPat->addElement(nop, *fStatus); + + int32_t dataLoc = fRXPat->fDataSize; + fRXPat->fDataSize += 4; + + int32_t op = URX_BUILD(URX_LB_START, dataLoc); + fRXPat->fCompiledPat->addElement(op, *fStatus); + + op = URX_BUILD(URX_STATE_SAVE, 0); + fRXPat->fCompiledPat->addElement(op, *fStatus); + + op = URX_BUILD(URX_LB_CONT, dataLoc); + fRXPat->fCompiledPat->addElement(op, *fStatus); + fRXPat->fCompiledPat->addElement(0, *fStatus); // min match len; filled in later + fRXPat->fCompiledPat->addElement(0, *fStatus); // max match len; filled in later + + fRXPat->fCompiledPat->addElement(nop, *fStatus); + fRXPat->fCompiledPat->addElement(nop, *fStatus); + + // Now build a frame for the conditional expression + fParenStack.push(fModeFlags, *fStatus); + fParenStack.push(condWithMatch, *fStatus); + fParenStack.push(fRXPat->fCompiledPat->size() - 8, *fStatus); // The NOP + fParenStack.push(fRXPat->fCompiledPat->size() - 6, *fStatus); // The SAVE_STATE + + // Now push a frame for the lookbehind itself + fParenStack.push((fModeFlags & ~URX_GOT_COND_ELSE) + | URX_INSIDE_CONDITIONAL, *fStatus); + fParenStack.push(lookBehindCE, *fStatus); + 
fParenStack.push(fRXPat->fCompiledPat->size() - 2, *fStatus); // The first NOP + fParenStack.push(fRXPat->fCompiledPat->size() - 1, *fStatus); // The second NOP + + // The lookbehind part isn't in the then-else part of the expression + fModeFlags &= ~URX_INSIDE_CONDITIONAL; + } + break; + + case doOpenLookBehind: { // Compile a (?<= look-behind open paren. @@ -605,10 +867,75 @@ fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location // The final two instructions will be added when the ')' is encountered. + + // No longer inside a conditional (if we were) + fModeFlags &= ~URX_INSIDE_CONDITIONAL; } break; + case doCondOpenLookBehindNeg: + // Conditional expression using negated lookbehind. + // + // Compiles to + // + // NOP reserved for quantifiers + // STATE_SAVE points to the false-part + // LB_START start lookbehind + // LBN_CONT continue from here + // - holds min match length + // - holds max match length + // - location to jump to on FAIL (points at STATE_POP) + // NOP reserved for quantifiers on the internal block + // NOP reserved for STATE_SAVE if we see a '|' in the condition + // pattern code code for the lookbehind expression + // LBN_END check match length, then FAIL + // STATE_POP pop the state we saved above + // true-part the bit that gets run if the neg. 
lookbehind succeeds + // JMP to the very end + // false-part the bit that gets run if the lookbehind fails + { + int32_t nop = URX_BUILD(URX_NOP, 0); + + // Emit the instructions + fRXPat->fCompiledPat->addElement(nop, *fStatus); + + int32_t dataLoc = fRXPat->fDataSize; + fRXPat->fDataSize += 4; + + int32_t op = URX_BUILD(URX_STATE_SAVE, 0); + fRXPat->fCompiledPat->addElement(op, *fStatus); + + op = URX_BUILD(URX_LB_START, dataLoc); + fRXPat->fCompiledPat->addElement(op, *fStatus); + + op = URX_BUILD(URX_LBN_CONT, dataLoc); + fRXPat->fCompiledPat->addElement(op, *fStatus); + fRXPat->fCompiledPat->addElement(0, *fStatus); // min length, filled in later + fRXPat->fCompiledPat->addElement(0, *fStatus); // max length, filled in later + fRXPat->fCompiledPat->addElement(0, *fStatus); // continue loc., filled in later + + fRXPat->fCompiledPat->addElement(nop, *fStatus); + fRXPat->fCompiledPat->addElement(nop, *fStatus); + + // On the paren stack, start a frame for the conditional expression + fParenStack.push(fModeFlags, *fStatus); + fParenStack.push(condWithMatch, *fStatus); + fParenStack.push(fRXPat->fCompiledPat->size() - 9, *fStatus); // The NOP location + fParenStack.push(fRXPat->fCompiledPat->size() - 8, *fStatus); // The STATE_SAVE location + + // Now push a frame for the lookbehind expression itself + fParenStack.push((fModeFlags & ~URX_GOT_COND_ELSE) + | URX_INSIDE_CONDITIONAL, *fStatus); + fParenStack.push(lookBehindNCE, *fStatus); + fParenStack.push(fRXPat->fCompiledPat->size() - 2, *fStatus); // The first NOP location + fParenStack.push(fRXPat->fCompiledPat->size() - 1, *fStatus); // The second NOP location + + // The lookbehind part isn't in the then-else part of the expression + fModeFlags &= ~URX_INSIDE_CONDITIONAL; + } + break; + case doOpenLookBehindNeg: { // Compile a (?fCompiledPat->size()-1, *fStatus); // The 2nd NOP location // The final two instructions will be added when the ')' is encountered. 
+ + // No longer inside a conditional (if we were) + fModeFlags &= ~URX_INSIDE_CONDITIONAL; } break; case doConditionalExpr: - // Conditionals such as (?(1)a:b) + // Conditionals such as (?(1)a|b). This handles the simple cases where we just want to test + // for a match with a capture group. + // + // Compiles to + // + // NOP reserved for quantifiers + // CONDITION skip the next instruction if the condition is true + // JMP goes to the false-branch part or the end if there is no false-branch + // true-branch + // JMP goes to the end part + // false-branch + // + { + UnicodeString *name = scanName(TRUE); + + if (!name) { + break; + } + + int32_t groupNum = 0; + int32_t nameLen = name->length(); + int32_t idx; + // Scan things that look like group numbers as a group number + for (idx = 0; idx < nameLen;) { + UChar32 c = name->char32At(idx); + int32_t digit = u_charDigitValue(c); + + if (digit < 0) { + groupNum = -1; + break; + } + + groupNum = groupNum * 10 + digit; + + if (RegexStaticSets::gStaticSets->fRuleDigits->contains(c) == FALSE) { + groupNum = -1; + break; + } + + // Move to the next code point + idx = name->moveIndex32(idx, 1); + } + + // Add a NOP so that the '*' operator can replace it later if necessary + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); + + // If we get here with groupNum set to -1, we assume that we have a name + if (groupNum < 0) { + int32_t nameIndex; + int32_t nameCount = fRXPat->fNamedGroups->size (); + for (nameIndex = 0; nameIndex < nameCount; ++nameIndex) { + UnicodeString *otherName = (UnicodeString *)fRXPat->fNamedGroups->elementAt(nameIndex); + + if (*otherName == *name) + break; + } + + if (nameIndex == nameCount) { + fRXPat->fNamedGroups->addElement(name, *fStatus); + fRXPat->fNamedGroupMap->addElement(-1, *fStatus); + } else { + delete name; + } + + // Add a URX_NAMED_CONDTN instruction; this will be replaced later + int32_t op = 
URX_BUILD(URX_NAMED_CONDTN, nameIndex); + fRXPat->fCompiledPat->addElement(op, *fStatus); + } else { + delete name; // numeric condition: the scanned text is not stored anywhere + if (groupNum == 0) { + error(U_REGEX_INVALID_BACK_REF); + } + + int32_t op = URX_BUILD(URX_CONDITION, groupNum); + fRXPat->fCompiledPat->addElement(op, *fStatus); + } + + // If URX_CONDITION succeeds, it skips the next instruction; this jump either + // points to the thing after the '|', or to the very end of the conditional expr + fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP, 0), *fStatus); + + // Push a frame onto the paren stack + fParenStack.push(fModeFlags, *fStatus); // Match mode state + fParenStack.push(conditional, *fStatus); // Begin a new frame + fParenStack.push(fRXPat->fCompiledPat->size() - 3, *fStatus); // The first NOP location + fParenStack.push(fRXPat->fCompiledPat->size() - 1, *fStatus); // The JMP location + + // We're now inside a conditional expression + fModeFlags = (fModeFlags & ~URX_GOT_COND_ELSE) | URX_INSIDE_CONDITIONAL; + } + break; + + + case doUnimplCondition: + // Unimplemented conditional match types + case doPerlInline: // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them. error(U_REGEX_UNIMPLEMENTED); @@ -1183,12 +1608,18 @@ nextCharLL(); } + // If the user specifies \0, it's an invalid back-reference + if (groupNum == 0) { + error(U_REGEX_INVALID_BACK_REF); + } else { + U_ASSERT(groupNum > 0); + } + // Scan of the back reference in the source regexp is complete. Now generate // the compiled code for it. // Because capture groups can be forward-referenced by back-references, // we fill the operand with the capture group number. At the end // of compilation, it will be changed to the variable's location. - U_ASSERT(groupNum > 0); int32_t op; if (fModeFlags & UREGEX_CASE_INSENSITIVE) { op = URX_BUILD(URX_BACKREF_I, groupNum); @@ -1199,6 +1630,47 @@ } break; + case doPyNamedBackRef: + // Named backreference. + { + // Find the name in the name table. 
We must have seen the name + // already at this point, otherwise using it is invalid. + UnicodeString *name = scanName(); + + if (!name) { + break; + } + + if (name->length() == 0) { + error(U_REGEX_BAD_GROUP_NAME); + break; + } + + int32_t nameIndex; + int32_t nameCount = fRXPat->fNamedGroups->size (); + for (nameIndex = 0; nameIndex < nameCount; ++nameIndex) { + UnicodeString *otherName = (UnicodeString *)fRXPat->fNamedGroups->elementAt(nameIndex); + + if (*otherName == *name) + break; + } + + if (nameIndex == nameCount) { + fRXPat->fNamedGroups->addElement(name, *fStatus); + fRXPat->fNamedGroupMap->addElement(-1, *fStatus); + } else { + delete name; + } + + int32_t op; + if (fModeFlags & UREGEX_CASE_INSENSITIVE) { + op = URX_BUILD(URX_NMD_BACKREF_I, nameIndex); + } else { + op = URX_BUILD(URX_NAMED_BACKREF, nameIndex); + } + fRXPat->fCompiledPat->addElement(op, *fStatus); + } + break; case doPossessivePlus: // Possessive ++ quantifier. @@ -1332,8 +1804,10 @@ case doSetMatchMode: // We've got a (?i) or similar. The match mode is being changed, but - // the change is not scoped to a parenthesized block. - fModeFlags = fNewModeFlags; + // the change is not scoped to a parenthesized block. Make sure we keep our + // special state flags. + fModeFlags = ((fNewModeFlags & URX_USER_MODE_FLAGS) + | (fModeFlags & URX_RESERVED_MODE_FLAGS)); // Prevent any string from spanning across the change of match mode. // Otherwise the pattern "abc(?i)def" would make a single string of "abcdef" @@ -1363,7 +1837,7 @@ fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP // Set the current mode flags to the new values. - fModeFlags = fNewModeFlags; + fModeFlags = fNewModeFlags & ~URX_INSIDE_CONDITIONAL; } break; @@ -1733,11 +2207,12 @@ switch (patIdx) { case plain: case flags: + case conditional: // No additional fixups required. // (Grouping-only parentheses) break; case capturing: - // Capturing Parentheses. + // Capturing Parentheses (possibly named). 
// Insert a End Capture op into the pattern. // The frame offset of the variables for this cg is obtained from the // start capture op and put it into the end-capture op. @@ -1763,9 +2238,11 @@ } break; + case lookAheadCE: case lookAhead: { - int32_t startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1); + int32_t startOfs = patIdx == lookAhead ? 1 : 2; + int32_t startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen - startOfs); U_ASSERT(URX_TYPE(startOp) == URX_LA_START); int32_t dataLoc = URX_VAL(startOp); int32_t op = URX_BUILD(URX_LA_END, dataLoc); @@ -1773,6 +2250,7 @@ } break; + case negLookAheadCE: case negLookAhead: { // See comment at doOpenLookAheadNeg @@ -1781,8 +2259,18 @@ int32_t dataLoc = URX_VAL(startOp); int32_t op = URX_BUILD(URX_LA_END, dataLoc); fRXPat->fCompiledPat->addElement(op, *fStatus); - op = URX_BUILD(URX_FAIL, 0); - fRXPat->fCompiledPat->addElement(op, *fStatus); + + if (patIdx == negLookAhead) { + op = URX_BUILD(URX_FAIL, 0); + fRXPat->fCompiledPat->addElement(op, *fStatus); + } else { + // Build a jump to the false-part and set the field of the conditional's + // stack frame to point at it so it'll get fixed up. + op = URX_BUILD(URX_JMP, 0); + fRXPat->fCompiledPat->addElement(op, *fStatus); + fParenStack.setElementAt(fRXPat->fCompiledPat->size() - 1, + fParenStack.size() - 1); + } // Patch the URX_SAVE near the top of the block. int32_t saveOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen); @@ -1793,12 +2281,14 @@ } break; + case lookBehindCE: case lookBehind: { // See comment at doOpenLookBehind. // Append the URX_LB_END and URX_LA_END to the compiled pattern. - int32_t startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4); + int32_t startOfs = patIdx == lookBehind ? 
4 : 5; + int32_t startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen - startOfs); U_ASSERT(URX_TYPE(startOp) == URX_LB_START); int32_t dataLoc = URX_VAL(startOp); int32_t op = URX_BUILD(URX_LB_END, dataLoc); @@ -1826,14 +2316,13 @@ } break; - - + case lookBehindNCE: case lookBehindN: { // See comment at doOpenLookBehindNeg. // Append the URX_LBN_END to the compiled pattern. - int32_t startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5); + int32_t startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen - 5); U_ASSERT(URX_TYPE(startOp) == URX_LB_START); int32_t dataLoc = URX_VAL(startOp); int32_t op = URX_BUILD(URX_LBN_END, dataLoc); @@ -1860,6 +2349,13 @@ // as the last operand of the URX_LBN_CONT op = URX_BUILD(URX_RELOC_OPRND, fRXPat->fCompiledPat->size()); fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1); + + if (patIdx == lookBehindNCE) { + // For conditional expressions, append a STATE_POP to remove the extra + // stack frame. + op = URX_BUILD(URX_STATE_POP, 0); + fRXPat->fCompiledPat->addElement(op, *fStatus); + } } break; @@ -2108,6 +2604,7 @@ case URX_END: case URX_STRING_LEN: case URX_NOP: + case URX_STATE_POP: case URX_START_CAPTURE: case URX_END_CAPTURE: case URX_BACKSLASH_B: @@ -2270,6 +2767,17 @@ atStart = FALSE; break; + case URX_CONDITION: + { + int32_t skipDest = loc + 2; + + U_ASSERT(skipDest <= end + 1); + // Propagate the current min length to the target loc of the skip + if (forwardedLength.elementAti(skipDest) > currentLen) { + forwardedLength.setElementAt(currentLen, skipDest); + } + } + break; case URX_JMPX: loc++; // Except for extra operand on URX_JMPX, same as URX_JMP. 
@@ -2579,6 +3087,7 @@ case URX_END: case URX_STRING_LEN: case URX_NOP: + case URX_STATE_POP: case URX_START_CAPTURE: case URX_END_CAPTURE: case URX_BACKSLASH_B: @@ -2619,6 +3128,17 @@ currentLen++; break; + case URX_CONDITION: + { + int32_t skipDest = loc + 2; + + U_ASSERT(skipDest <= end + 1); + // Propagate the current min length to the target loc of the skip + if (forwardedLength.elementAti(skipDest) > currentLen) { + forwardedLength.setElementAt(currentLen, skipDest); + } + } + break; case URX_JMPX: loc++; // URX_JMPX has an extra operand, ignored here, @@ -2819,6 +3339,7 @@ case URX_END: case URX_STRING_LEN: case URX_NOP: + case URX_STATE_POP: case URX_START_CAPTURE: case URX_END_CAPTURE: case URX_BACKSLASH_B: @@ -2876,6 +3397,22 @@ } break; + // Conditional skip operation. A bit like a jump. + // + case URX_CONDITION: + { + int32_t skipDest = loc + 2; + + U_ASSERT(skipDest <= end + 1); + // Propagate the current max length + if (forwardedLength.elementAti(skipDest) < currentLen) { + forwardedLength.setElementAt(currentLen, skipDest); + } + // N.B. Don't reset currentLen here; if the condition isn't met, we will continue + // matching at the next instruction. + } + break; + // Jumps. // case URX_JMP: @@ -2888,7 +3425,7 @@ // Loop of some kind. Max match length is unbounded. currentLen = INT32_MAX; } else { - // Forward jump. Propagate the current min length to the target loc of the jump. + // Forward jump. Propagate the current max length to the target loc of the jump. 
if (forwardedLength.elementAti(jmpDest) < currentLen) { forwardedLength.setElementAt(currentLen, jmpDest); } @@ -3061,6 +3598,8 @@ break; } + case URX_STATE_POP: + case URX_CONDITION: case URX_RESERVED_OP: case URX_RESERVED_OP_N: case URX_BACKTRACK: @@ -3247,7 +3786,9 @@ static const UChar chUpperN = 0x4E; static const UChar chLowerP = 0x70; static const UChar chUpperP = 0x50; - +static const UChar chLAngle = 0x3c; // '<' +static const UChar chEquals = 0x3d; // '=' +static const UChar chRAngle = 0x3e; // '>' //------------------------------------------------------------------------------ // @@ -3527,6 +4068,72 @@ return uset; } + +//------------------------------------------------------------------------------ +// +// scanName Scan the name for a named group, e.g. (?P<name>foo), a named +// backreference, e.g. (?P=name). +// +// Case 1: Named group +// The scan position will be at the '<'. On return, the scan +// position should be just after the '>'. +// +// Case 2: Named backreference +// The scan position will be at the '='. On return the scan +// position should be just after the ')'. +// +// Case 3: Conditional reference +// The scan position will be *after* the '('. On return, the scan +// position should be just after the ')'. +// +// Return a heap-allocated UnicodeString containing the name (the +// caller takes ownership), or NULL if the pattern is invalid. 
+// +//------------------------------------------------------------------------------ +UnicodeString *RegexCompile::scanName(UBool isCondition) +{ + UChar32 chTerm; + + if (U_FAILURE(*fStatus)) { + return NULL; + } + + if (!isCondition) { + U_ASSERT(fC.fChar == chLAngle || fC.fChar == chEquals); + + if (fC.fChar == chLAngle) + chTerm = chRAngle; + else + chTerm = chRParen; + + nextChar(fC); + } else { + chTerm = chRParen; + } + + UnicodeString *name = new UnicodeString(); + UChar32 ch; + if (!name) { + error(U_MEMORY_ALLOCATION_ERROR); + return NULL; + } + do { + ch = fC.fChar; + if (ch == -1) { + // Hit the end of the input string without finding the closing + // character + error(U_REGEX_UNTERMINATED_GROUP_NAME); + delete name; + return NULL; + } + if (ch != chTerm) + name->append (ch); + nextChar(fC); + } while (ch != chTerm); + + return name; +} + U_NAMESPACE_END #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS diff -u -r icu/source/i18n/regexcmp.h icu.new/source/i18n/regexcmp.h --- icu/source/i18n/regexcmp.h 2003-04-03 00:10:16.000000000 +0100 +++ icu.new/source/i18n/regexcmp.h 2006-08-10 00:20:43.000000000 +0100 @@ -28,6 +28,17 @@ U_NAMESPACE_BEGIN +// +// Special mode flags, used during compilation only +// +enum { + URX_USER_MODE_FLAGS = 0x00ffffff, + URX_RESERVED_MODE_FLAGS = ~URX_USER_MODE_FLAGS, + + URX_INSIDE_CONDITIONAL = 0x80000000, // If set, we're inside a conditional expression + URX_GOT_COND_ELSE = 0x40000000, // If set, we've already seen the '|' +}; + //-------------------------------------------------------------------------------- // // class RegexCompile Contains the regular expression compiler. @@ -70,14 +81,22 @@ // The category is saved in the compile-time parentheses stack frame, and // determines the code to be generated when the matching close ) is encountered. 
enum EParenClass { - plain = -1, // No special handling - capturing = -2, - atomic = -3, - lookAhead = -4, - negLookAhead = -5, - flags = -6, - lookBehind = -7, - lookBehindN = -8 + plain = -1, // No special handling + capturing = -2, + atomic = -3, + lookAhead = -4, + negLookAhead = -5, + flags = -6, + lookBehind = -7, + lookBehindN = -8, + conditional = -9, + condWithMatch = -10, // A conditional that uses a lookahead/lookbehind + + // Used when these appear in the test part of a conditional + lookAheadCE = -11, + negLookAheadCE = -12, + lookBehindCE = -13, + lookBehindNCE = -14 }; private: @@ -90,6 +109,7 @@ UChar32 peekCharLL(); UnicodeSet *scanSet(); UnicodeSet *scanProp(); + UnicodeString *scanName(UBool isCondition = FALSE); void handleCloseParen(); int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern // at the top of the just completed block @@ -154,6 +174,8 @@ // Data associated with the generation of the pcode for the match engine // int32_t fModeFlags; // Match Flags. (Case Insensitive, etc.) + // This also gets used to hold some state info + // for conditional exprs (in the high byte) int32_t fNewModeFlags; // New flags, while compiling (?i, holds state // until last flag is scanned. UBool fSetModeFlag; // true for (?ismx, false for (?-ismx @@ -175,7 +197,6 @@ // the kind of paren that opened the frame. Some // need special handling on close. - int32_t fMatchOpenParen; // The position in the compiled pattern // of the slot reserved for a state save // at the start of the most recently processed diff -u -r icu/source/i18n/regexcst.h icu.new/source/i18n/regexcst.h --- icu/source/i18n/regexcst.h 2003-11-08 02:01:42.000000000 +0000 +++ icu.new/source/i18n/regexcst.h 2006-08-10 15:25:15.000000000 +0100 @@ -16,75 +16,82 @@ // // Character classes for regex pattern scanning. 
// - static const uint8_t kRuleSet_digit_char = 128; - static const uint8_t kRuleSet_white_space = 129; + static const uint8_t kRuleSet_white_space = 128; + static const uint8_t kRuleSet_digit_char = 129; static const uint8_t kRuleSet_rule_char = 130; enum Regex_PatternParseAction { - doPossessivePlus, - doCloseParen, - doProperty, - doBeginMatchMode, - doOrOperator, - doOpenCaptureParen, - doBadOpenParenType, - doRuleError, + doLiteralChar, + doBackslashA, + doNOP, + doBackslashG, + doPerlInline, + doPyNamedBackRef, doIntevalLowerDigit, - doBackslashs, - doNGOpt, - doBackslashw, - doMismatchedParenErr, - doOpenLookBehind, - doBackslashz, + doProperty, + doBackslashX, + doOpenAtomicParen, + doOpenPyNamedParen, + doPatFinish, + doUnimplCondition, + doNGPlus, + doOpenLookBehindNeg, doIntervalError, - doStar, - doCaret, - doEnterQuoteMode, - doNGStar, - doMatchMode, - doIntervalUpperDigit, - doOpenLookAheadNeg, + doIntervalSame, + doBackRef, doPlus, + doOpenCaptureParen, + doMismatchedParenErr, + doBeginMatchMode, + doEscapeError, doOpenNonCaptureParen, - doBackslashA, - doBackslashB, - doNGPlus, + doDollar, + doIntervalUpperDigit, + doBackslashs, doSetMatchMode, - doPatFinish, - doBackslashD, - doPossessiveInterval, - doEscapeError, - doBackslashG, - doSuppressComments, + doOpenLookBehind, doMatchModeParen, + doOrOperator, + doCaret, + doStar, doOpt, - doInterval, - doLiteralChar, - doIntervalInit, - doOpenAtomicParen, - doBackslashS, - doOpenLookAhead, - doBackRef, - doDollar, - doDotAny, + doMatchMode, + doSuppressComments, + doPossessiveInterval, + doOpenLookAheadNeg, doBackslashW, - doBackslashX, + doCloseParen, + doIntervalInit, doScanUnicodeSet, - doBackslashZ, - doPerlInline, + doNGStar, + doEnterQuoteMode, + doBackslashB, + doBackslashw, doPossessiveOpt, - doNOP, + doRuleError, + doBackslashb, + doCondOpenLookBehindNeg, doConditionalExpr, - doExit, + doCondOpenLookAheadNeg, + doPossessivePlus, + doBadOpenParenType, doNGInterval, + doBackslashd, + 
doBackslashD, + doCondOpenLookBehind, + doExit, + doInterval, + doNGOpt, + doBackslashZ, + doBackslashS, + doOpenLookAhead, doPatStart, doBadModeFlag, - doBackslashb, doPossessiveStar, - doBackslashd, - doIntervalSame, - doOpenLookBehindNeg, + doBackslashz, + doCondOpenLookAhead, + doDotAny, rbbiLastAction}; //------------------------------------------------------------------------------- @@ -112,15 +119,15 @@ , {doDotAny, 46 /* . */, 14,0, TRUE} // 6 , {doCaret, 94 /* ^ */, 2,0, TRUE} // 7 , {doDollar, 36 /* $ */, 2,0, TRUE} // 8 - , {doNOP, 92 /* \ */, 81,0, TRUE} // 9 + , {doNOP, 92 /* \ */, 93,0, TRUE} // 9 , {doOrOperator, 124 /* | */, 2,0, TRUE} // 10 , {doCloseParen, 41 /* ) */, 255,0, TRUE} // 11 , {doPatFinish, 253, 2,0, FALSE} // 12 - , {doRuleError, 255, 101,0, FALSE} // 13 - , {doNOP, 42 /* * */, 59,0, TRUE} // 14 expr-quant - , {doNOP, 43 /* + */, 62,0, TRUE} // 15 - , {doNOP, 63 /* ? */, 65,0, TRUE} // 16 - , {doIntervalInit, 123 /* { */, 68,0, TRUE} // 17 + , {doRuleError, 255, 113,0, FALSE} // 13 + , {doNOP, 42 /* * */, 71,0, TRUE} // 14 expr-quant + , {doNOP, 43 /* + */, 74,0, TRUE} // 15 + , {doNOP, 63 /* ? */, 77,0, TRUE} // 16 + , {doIntervalInit, 123 /* { */, 80,0, TRUE} // 17 , {doNOP, 40 /* ( */, 23,0, TRUE} // 18 , {doNOP, 255, 20,0, FALSE} // 19 , {doOrOperator, 124 /* | */, 2,0, TRUE} // 20 expr-cont @@ -128,7 +135,7 @@ , {doNOP, 255, 2,0, FALSE} // 22 , {doSuppressComments, 63 /* ? */, 25,0, TRUE} // 23 open-paren-quant , {doNOP, 255, 27,0, FALSE} // 24 - , {doNOP, 35 /* # */, 47, 14, TRUE} // 25 open-paren-quant2 + , {doNOP, 35 /* # */, 59, 14, TRUE} // 25 open-paren-quant2 , {doNOP, 255, 29,0, FALSE} // 26 , {doSuppressComments, 63 /* ? */, 29,0, TRUE} // 27 open-paren , {doOpenCaptureParen, 255, 2, 14, FALSE} // 28 @@ -136,75 +143,87 @@ , {doOpenAtomicParen, 62 /* > */, 2, 14, TRUE} // 30 , {doOpenLookAhead, 61 /* = */, 2, 20, TRUE} // 31 , {doOpenLookAheadNeg, 33 /* ! 
*/, 2, 20, TRUE} // 32 - , {doNOP, 60 /* < */, 44,0, TRUE} // 33 - , {doNOP, 35 /* # */, 47, 2, TRUE} // 34 - , {doBeginMatchMode, 105 /* i */, 50,0, FALSE} // 35 - , {doBeginMatchMode, 109 /* m */, 50,0, FALSE} // 36 - , {doBeginMatchMode, 115 /* s */, 50,0, FALSE} // 37 - , {doBeginMatchMode, 119 /* w */, 50,0, FALSE} // 38 - , {doBeginMatchMode, 120 /* x */, 50,0, FALSE} // 39 - , {doBeginMatchMode, 45 /* - */, 50,0, FALSE} // 40 - , {doConditionalExpr, 40 /* ( */, 101,0, TRUE} // 41 - , {doPerlInline, 123 /* { */, 101,0, TRUE} // 42 - , {doBadOpenParenType, 255, 101,0, FALSE} // 43 - , {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 44 open-paren-lookbehind - , {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE} // 45 - , {doBadOpenParenType, 255, 101,0, FALSE} // 46 - , {doNOP, 41 /* ) */, 255,0, TRUE} // 47 paren-comment - , {doMismatchedParenErr, 253, 101,0, FALSE} // 48 - , {doNOP, 255, 47,0, TRUE} // 49 - , {doMatchMode, 105 /* i */, 50,0, TRUE} // 50 paren-flag - , {doMatchMode, 109 /* m */, 50,0, TRUE} // 51 - , {doMatchMode, 115 /* s */, 50,0, TRUE} // 52 - , {doMatchMode, 119 /* w */, 50,0, TRUE} // 53 - , {doMatchMode, 120 /* x */, 50,0, TRUE} // 54 - , {doMatchMode, 45 /* - */, 50,0, TRUE} // 55 - , {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 56 - , {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 57 - , {doBadModeFlag, 255, 101,0, FALSE} // 58 - , {doNGStar, 63 /* ? */, 20,0, TRUE} // 59 quant-star - , {doPossessiveStar, 43 /* + */, 20,0, TRUE} // 60 - , {doStar, 255, 20,0, FALSE} // 61 - , {doNGPlus, 63 /* ? */, 20,0, TRUE} // 62 quant-plus - , {doPossessivePlus, 43 /* + */, 20,0, TRUE} // 63 - , {doPlus, 255, 20,0, FALSE} // 64 - , {doNGOpt, 63 /* ? 
*/, 20,0, TRUE} // 65 quant-opt - , {doPossessiveOpt, 43 /* + */, 20,0, TRUE} // 66 - , {doOpt, 255, 20,0, FALSE} // 67 - , {doNOP, 129, 68,0, TRUE} // 68 interval-open - , {doNOP, 128, 71,0, FALSE} // 69 - , {doIntervalError, 255, 101,0, FALSE} // 70 - , {doIntevalLowerDigit, 128, 71,0, TRUE} // 71 interval-lower - , {doNOP, 44 /* , */, 75,0, TRUE} // 72 - , {doIntervalSame, 125 /* } */, 78,0, TRUE} // 73 - , {doIntervalError, 255, 101,0, FALSE} // 74 - , {doIntervalUpperDigit, 128, 75,0, TRUE} // 75 interval-upper - , {doNOP, 125 /* } */, 78,0, TRUE} // 76 - , {doIntervalError, 255, 101,0, FALSE} // 77 - , {doNGInterval, 63 /* ? */, 20,0, TRUE} // 78 interval-type - , {doPossessiveInterval, 43 /* + */, 20,0, TRUE} // 79 - , {doInterval, 255, 20,0, FALSE} // 80 - , {doBackslashA, 65 /* A */, 2,0, TRUE} // 81 backslash - , {doBackslashB, 66 /* B */, 2,0, TRUE} // 82 - , {doBackslashb, 98 /* b */, 2,0, TRUE} // 83 - , {doBackslashd, 100 /* d */, 14,0, TRUE} // 84 - , {doBackslashD, 68 /* D */, 14,0, TRUE} // 85 - , {doBackslashG, 71 /* G */, 2,0, TRUE} // 86 - , {doProperty, 78 /* N */, 14,0, FALSE} // 87 - , {doProperty, 112 /* p */, 14,0, FALSE} // 88 - , {doProperty, 80 /* P */, 14,0, FALSE} // 89 - , {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 90 - , {doBackslashS, 83 /* S */, 14,0, TRUE} // 91 - , {doBackslashs, 115 /* s */, 14,0, TRUE} // 92 - , {doBackslashW, 87 /* W */, 14,0, TRUE} // 93 - , {doBackslashw, 119 /* w */, 14,0, TRUE} // 94 - , {doBackslashX, 88 /* X */, 14,0, TRUE} // 95 - , {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 96 - , {doBackslashz, 122 /* z */, 2,0, TRUE} // 97 - , {doBackRef, 128, 14,0, TRUE} // 98 - , {doEscapeError, 253, 101,0, FALSE} // 99 - , {doLiteralChar, 255, 14,0, TRUE} // 100 - , {doExit, 255, 101,0, TRUE} // 101 errorDeath + , {doNOP, 60 /* < */, 45,0, TRUE} // 33 + , {doNOP, 35 /* # */, 59, 2, TRUE} // 34 + , {doBeginMatchMode, 105 /* i */, 62,0, FALSE} // 35 + , {doBeginMatchMode, 109 /* m */, 62,0, FALSE} // 36 + , 
{doBeginMatchMode, 115 /* s */, 62,0, FALSE} // 37 + , {doBeginMatchMode, 119 /* w */, 62,0, FALSE} // 38 + , {doBeginMatchMode, 120 /* x */, 62,0, FALSE} // 39 + , {doBeginMatchMode, 45 /* - */, 62,0, FALSE} // 40 + , {doNOP, 40 /* ( */, 48,0, TRUE} // 41 + , {doPerlInline, 123 /* { */, 113,0, TRUE} // 42 + , {doNOP, 80 /* P */, 57,0, TRUE} // 43 + , {doBadOpenParenType, 255, 113,0, FALSE} // 44 + , {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 45 open-paren-lookbehind + , {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE} // 46 + , {doBadOpenParenType, 255, 113,0, FALSE} // 47 + , {doNOP, 63 /* ? */, 50, 14, TRUE} // 48 open-paren-cond + , {doConditionalExpr, 255, 2, 14, FALSE} // 49 + , {doCondOpenLookAhead, 61 /* = */, 2, 2, TRUE} // 50 open-paren-cond-ext + , {doCondOpenLookAheadNeg, 33 /* ! */, 2, 2, TRUE} // 51 + , {doNOP, 60 /* < */, 54,0, TRUE} // 52 + , {doUnimplCondition, 255, 113,0, FALSE} // 53 + , {doCondOpenLookBehind, 61 /* = */, 2, 2, TRUE} // 54 open-paren-cond-lb + , {doCondOpenLookBehindNeg, 33 /* ! */, 2, 2, TRUE} // 55 + , {doUnimplCondition, 255, 113,0, FALSE} // 56 + , {doOpenPyNamedParen, 60 /* < */, 2, 14, FALSE} // 57 paren-python + , {doPyNamedBackRef, 61 /* = */, 14,0, FALSE} // 58 + , {doNOP, 41 /* ) */, 255,0, TRUE} // 59 paren-comment + , {doMismatchedParenErr, 253, 113,0, FALSE} // 60 + , {doNOP, 255, 59,0, TRUE} // 61 + , {doMatchMode, 105 /* i */, 62,0, TRUE} // 62 paren-flag + , {doMatchMode, 109 /* m */, 62,0, TRUE} // 63 + , {doMatchMode, 115 /* s */, 62,0, TRUE} // 64 + , {doMatchMode, 119 /* w */, 62,0, TRUE} // 65 + , {doMatchMode, 120 /* x */, 62,0, TRUE} // 66 + , {doMatchMode, 45 /* - */, 62,0, TRUE} // 67 + , {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 68 + , {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 69 + , {doBadModeFlag, 255, 113,0, FALSE} // 70 + , {doNGStar, 63 /* ? */, 20,0, TRUE} // 71 quant-star + , {doPossessiveStar, 43 /* + */, 20,0, TRUE} // 72 + , {doStar, 255, 20,0, FALSE} // 73 + , {doNGPlus, 63 /* ? 
*/, 20,0, TRUE} // 74 quant-plus + , {doPossessivePlus, 43 /* + */, 20,0, TRUE} // 75 + , {doPlus, 255, 20,0, FALSE} // 76 + , {doNGOpt, 63 /* ? */, 20,0, TRUE} // 77 quant-opt + , {doPossessiveOpt, 43 /* + */, 20,0, TRUE} // 78 + , {doOpt, 255, 20,0, FALSE} // 79 + , {doNOP, 128, 80,0, TRUE} // 80 interval-open + , {doNOP, 129, 83,0, FALSE} // 81 + , {doIntervalError, 255, 113,0, FALSE} // 82 + , {doIntevalLowerDigit, 129, 83,0, TRUE} // 83 interval-lower + , {doNOP, 44 /* , */, 87,0, TRUE} // 84 + , {doIntervalSame, 125 /* } */, 90,0, TRUE} // 85 + , {doIntervalError, 255, 113,0, FALSE} // 86 + , {doIntervalUpperDigit, 129, 87,0, TRUE} // 87 interval-upper + , {doNOP, 125 /* } */, 90,0, TRUE} // 88 + , {doIntervalError, 255, 113,0, FALSE} // 89 + , {doNGInterval, 63 /* ? */, 20,0, TRUE} // 90 interval-type + , {doPossessiveInterval, 43 /* + */, 20,0, TRUE} // 91 + , {doInterval, 255, 20,0, FALSE} // 92 + , {doBackslashA, 65 /* A */, 2,0, TRUE} // 93 backslash + , {doBackslashB, 66 /* B */, 2,0, TRUE} // 94 + , {doBackslashb, 98 /* b */, 2,0, TRUE} // 95 + , {doBackslashd, 100 /* d */, 14,0, TRUE} // 96 + , {doBackslashD, 68 /* D */, 14,0, TRUE} // 97 + , {doBackslashG, 71 /* G */, 2,0, TRUE} // 98 + , {doProperty, 78 /* N */, 14,0, FALSE} // 99 + , {doProperty, 112 /* p */, 14,0, FALSE} // 100 + , {doProperty, 80 /* P */, 14,0, FALSE} // 101 + , {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 102 + , {doBackslashS, 83 /* S */, 14,0, TRUE} // 103 + , {doBackslashs, 115 /* s */, 14,0, TRUE} // 104 + , {doBackslashW, 87 /* W */, 14,0, TRUE} // 105 + , {doBackslashw, 119 /* w */, 14,0, TRUE} // 106 + , {doBackslashX, 88 /* X */, 14,0, TRUE} // 107 + , {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 108 + , {doBackslashz, 122 /* z */, 2,0, TRUE} // 109 + , {doBackRef, 129, 14,0, TRUE} // 110 + , {doEscapeError, 253, 113,0, FALSE} // 111 + , {doLiteralChar, 255, 14,0, TRUE} // 112 + , {doExit, 255, 113,0, TRUE} // 113 errorDeath }; static const char * const RegexStateNames[] = { 
0, "start", @@ -250,9 +269,21 @@ 0, 0, 0, + 0, "open-paren-lookbehind", 0, 0, + "open-paren-cond", + 0, + "open-paren-cond-ext", + 0, + 0, + 0, + "open-paren-cond-lb", + 0, + 0, + "paren-python", + 0, "paren-comment", 0, 0, diff -u -r icu/source/i18n/regexcst.txt icu.new/source/i18n/regexcst.txt --- icu/source/i18n/regexcst.txt 2003-11-08 02:01:42.000000000 +0000 +++ icu.new/source/i18n/regexcst.txt 2006-08-10 12:06:10.000000000 +0100 @@ -1,4 +1,3 @@ - #***************************************************************************** # # Copyright (C) 2002-2003, International Business Machines Corporation and others. @@ -74,9 +73,9 @@ '\' n backslash '|' n term doOrOperator ')' n pop doCloseParen - eof term doPatFinish + eof term doPatFinish default errorDeath doRuleError - + # @@ -138,15 +137,48 @@ 'w' paren-flag doBeginMatchMode 'x' paren-flag doBeginMatchMode '-' paren-flag doBeginMatchMode - '(' n errorDeath doConditionalExpr + '(' n open-paren-cond '{' n errorDeath doPerlInline + 'P' n paren-python default errorDeath doBadOpenParenType open-paren-lookbehind: '=' n term ^expr-cont doOpenLookBehind # (?<= '!' n term ^expr-cont doOpenLookBehindNeg # (?...), or possibly a Python +# named group reference, e.g. (?P=id). +# +paren-python: + '<' term ^expr-quant doOpenPyNamedParen # (?P + '=' expr-quant doPyNamedBackRef # (?P=name) # # paren-comment We've got a (?# ... ) style comment. 
Eat pattern text till we get to the ')' @@ -154,7 +186,7 @@ # paren-comment: ')' n pop - eof errorDeath doMismatchedParenErr + eof errorDeath doMismatchedParenErr default n paren-comment # @@ -213,7 +245,7 @@ interval-lower: digit_char n interval-lower doIntevalLowerDigit - ',' n interval-upper + ',' n interval-upper '}' n interval-type doIntervalSame # {n} default errorDeath doIntervalError @@ -250,9 +282,9 @@ 'X' n expr-quant doBackslashX 'Z' n term doBackslashZ 'z' n term doBackslashz - digit_char n expr-quant doBackRef # Will scan multiple digits + digit_char n expr-quant doBackRef # Will scan multiple digits eof errorDeath doEscapeError - default n expr-quant doLiteralChar # Escaped literal char. + default n expr-quant doLiteralChar # Escaped literal char. # diff -u -r icu/source/i18n/regeximp.h icu.new/source/i18n/regeximp.h --- icu/source/i18n/regeximp.h 2004-12-30 07:25:50.000000000 +0000 +++ icu.new/source/i18n/regeximp.h 2006-08-10 00:53:23.000000000 +0100 @@ -168,10 +168,20 @@ // Operand value: // 0: Normal (. doesn't match new-line) mode. // 1: . matches new-line mode. - URX_BACKSLASH_BU = 53 // \b or \B in UREGEX_UWORD mode, using Unicode style + URX_BACKSLASH_BU = 53, // \b or \B in UREGEX_UWORD mode, using Unicode style // word boundaries. - -}; + URX_NAMED_BACKREF = 54, // A named back-reference; this gets replaced with a normal + // back-reference during compilation. Operand is the name index. + URX_NMD_BACKREF_I = 55, // Case insensitive named back-reference. + URX_CONDITION = 56, // A conditional skip; only matches if the specified group has been + // matched. If it has, skips the next instruction. + // Parameter is the index of the capture group variables in + // the state stack frame. + URX_NAMED_CONDTN = 57, // Like URX_NAMED_BACKREF, this is replaced during compilation. + // Parameter is the name index of the name. + URX_STATE_POP = 58 // Pop a saved state from the stack without backtracking. + // Basically just cancels the last STATE_SAVE. 
+}; // Keep this list of opcode names in sync with the above enum // Used for debug printing only. @@ -229,8 +239,12 @@ "LOOP_SR_I", \ "LOOP_C", \ "LOOP_DOT_I", \ - "BACKSLASH_BU" - + "BACKSLASH_BU", \ + "NAMED_BACKREF", \ + "NAMED_BACKREF_I", \ + "CONDITION", \ + "NAMED_CONDITION", \ + "STATE_POP" // // Convenience macros for assembling and disassembling a compiled operation. diff -u -r icu/source/i18n/rematch.cpp icu.new/source/i18n/rematch.cpp --- icu/source/i18n/rematch.cpp 2005-06-17 01:42:54.000000000 +0100 +++ icu.new/source/i18n/rematch.cpp 2006-08-10 00:51:33.000000000 +0100 @@ -96,7 +96,6 @@ if (U_FAILURE(status)) { return; } - if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) { fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t)); } @@ -299,6 +298,31 @@ +int32_t RegexMatcher::end(const UnicodeString &groupName, UErrorCode &err) const { + if (U_FAILURE(err)) { + return -1; + } + if (fMatch == FALSE) { + err = U_REGEX_INVALID_STATE; + return -1; + } + + int32_t namedGroupCount = fPattern->fNamedGroups->size(); + int32_t nameIndex; + for (nameIndex = 0; nameIndex < namedGroupCount; ++nameIndex) { + UnicodeString *name = (UnicodeString *)fPattern->fNamedGroups->elementAt(nameIndex); + + if (*name == groupName) { + int32_t groupNum = fPattern->fNamedGroupMap->elementAti(nameIndex); + return end(groupNum + 1, err); + } + } + + err = U_REGEX_UNKNOWN_GROUP_NAME; + return -1; +} + + //-------------------------------------------------------------------------------- // // find() @@ -541,6 +565,48 @@ } +UnicodeString RegexMatcher::group(const UnicodeString &groupName, + UErrorCode &status) const { + if (U_FAILURE(status)) { + return UnicodeString(); + } + if (U_FAILURE(fDeferredStatus)) { + status = fDeferredStatus; + return UnicodeString(); + } + if (fMatch == FALSE) { + status = U_REGEX_INVALID_STATE; + return UnicodeString(); + } + + int32_t namedGroupCount = fPattern->fNamedGroups->size(); + int32_t nameIndex; + for 
(nameIndex = 0; nameIndex < namedGroupCount; ++nameIndex) { + UnicodeString *name = (UnicodeString *)fPattern->fNamedGroups->elementAt(nameIndex); + + if (*name == groupName) { + int32_t groupNum = fPattern->fNamedGroupMap->elementAti(nameIndex); + return group(groupNum + 1, status); + } + } + + status = U_REGEX_UNKNOWN_GROUP_NAME; + return UnicodeString(); +} + + + +// This is only here because groupCount() is a matcher method +int32_t RegexMatcher::groupIndexFromName(const UnicodeString &groupName, + UErrorCode &status) const { + if (U_FAILURE(fDeferredStatus)) { + status = fDeferredStatus; + return -1; + } + + return fPattern->groupIndexFromName (groupName, status); +} + int32_t RegexMatcher::groupCount() const { @@ -858,7 +924,6 @@ - int32_t RegexMatcher::start(int group, UErrorCode &status) const { if (U_FAILURE(status)) { return -1; @@ -889,6 +954,32 @@ +int32_t RegexMatcher::start(const UnicodeString &groupName, + UErrorCode &err) const { + if (U_FAILURE(err)) { + return -1; + } + if (fMatch == FALSE) { + err = U_REGEX_INVALID_STATE; + return -1; + } + + int32_t namedGroupCount = fPattern->fNamedGroups->size(); + int32_t nameIndex; + for (nameIndex = 0; nameIndex < namedGroupCount; ++nameIndex) { + UnicodeString *name = (UnicodeString *)fPattern->fNamedGroups->elementAt(nameIndex); + + if (*name == groupName) { + int32_t groupNum = fPattern->fNamedGroupMap->elementAti(nameIndex); + return start(groupNum + 1, err); + } + } + + err = U_REGEX_UNKNOWN_GROUP_NAME; + return -1; +} + + //================================================================================ // // Code following this point in this file is the internal @@ -1093,6 +1184,9 @@ fData[i] = 0; } + // A string to hold the name of the capturing group + int32_t captureNameIdx = -1; + // // Main loop for interpreting the compiled pattern. // One iteration of the loop per pattern operation performed. 
@@ -1192,6 +1286,14 @@ fp = StateSave(fp, opValue, frameSize, status); break; + case URX_STATE_POP: + { + // Pop the state without moving the pattern index + int32_t nextOp = fp->fPatIdx; + fp = (REStackFrame *)fStack->popFrame(frameSize); + fp->fPatIdx = nextOp; + } + break; case URX_END: // The match loop will exit via this path on a successful match, @@ -1620,6 +1722,17 @@ } break; + case URX_CONDITION: + { + U_ASSERT(opValue < frameSize); + int32_t groupStartIdx = fp->fExtra[opValue]; + + if (groupStartIdx >= 0) { + // This group has been matched, so skip the next instruction + fp->fPatIdx++; + } + } + break; case URX_JMP: fp->fPatIdx = opValue; diff -u -r icu/source/i18n/repattrn.cpp icu.new/source/i18n/repattrn.cpp --- icu/source/i18n/repattrn.cpp 2004-12-30 07:25:50.000000000 +0000 +++ icu.new/source/i18n/repattrn.cpp 2006-08-10 00:52:09.000000000 +0100 @@ -57,6 +57,11 @@ // Assignmenet Operator // //-------------------------------------------------------------------------- +static void copyStringTok(UHashTok *a, UHashTok *b) +{ + a->pointer = new UnicodeString (*(UnicodeString *)b->pointer); +} + RegexPattern &RegexPattern::operator = (const RegexPattern &other) { if (this == &other) { // Source and destination are the same. Don't do anything. @@ -91,6 +96,8 @@ // Copy the pattern. It's just values, nothing deep to copy. fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus); fGroupMap->assign(*other.fGroupMap, fDeferredStatus); + fNamedGroups->assign(*other.fNamedGroups, (UTokenAssigner *)copyStringTok, fDeferredStatus); + fNamedGroupMap->assign(*other.fNamedGroupMap, fDeferredStatus); // Copy the Unicode Sets. 
// Could be made more efficient if the sets were reference counted and shared, @@ -135,6 +142,8 @@ fFrameSize = 0; fDataSize = 0; fGroupMap = NULL; + fNamedGroups = NULL; + fNamedGroupMap = NULL; fMaxCaptureDigits = 1; fStaticSets = NULL; fStaticSets8 = NULL; @@ -147,14 +156,16 @@ fCompiledPat = new UVector32(fDeferredStatus); fGroupMap = new UVector32(fDeferredStatus); + fNamedGroups = new UVector(fDeferredStatus); + fNamedGroupMap = new UVector32(fDeferredStatus); fSets = new UVector(fDeferredStatus); fInitialChars = new UnicodeSet; fInitialChars8 = new Regex8BitSet; if (U_FAILURE(fDeferredStatus)) { return; } - if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL || - fInitialChars == NULL || fInitialChars8 == NULL) { + if (fCompiledPat == NULL || fGroupMap == NULL || fNamedGroups == NULL || fNamedGroupMap == NULL + || fSets == NULL || fInitialChars == NULL || fInitialChars8 == NULL) { fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; return; } @@ -186,6 +197,17 @@ fSets8 = NULL; delete fGroupMap; fGroupMap = NULL; + delete fNamedGroupMap; + fNamedGroupMap = NULL; + for (i = 0; i < fNamedGroups->size(); ++i) { + UnicodeString *s; + s = (UnicodeString *)fNamedGroups->elementAt(i); + if (s != NULL) { + delete s; + } + } + delete fNamedGroups; + fNamedGroups = NULL; delete fInitialChars; fInitialChars = NULL; delete fInitialChars8; @@ -404,6 +426,95 @@ //--------------------------------------------------------------------- // +// getGroupNames +// +//--------------------------------------------------------------------- +int32_t RegexPattern::getGroupNames(int32_t startIndex, + UnicodeString dest[], + int32_t destCapacity, + UErrorCode &status) { + if (U_FAILURE(status)) { + return 0; + } + + int32_t count = fNamedGroups->size(); + + if (!destCapacity) + return count; + + if (startIndex >= count) { + // This is not an error, the intention being that users of this + // method can write something like this: + // + // UnicodeString names[10]; + // int32_t count, 
index; + // + // while ((count = pattern->getGroupNames(index, names, 10, &status)) + // && U_SUCCESS(status)) { + // ... + // index += count; + // } + return 0; + } + + count -= startIndex; + if (count > destCapacity) + count = destCapacity; + + int32_t n; + for (n = 0; n < count; ++n) { + dest[n] = *(UnicodeString *)fNamedGroups->elementAt(n + startIndex); + } + + return count; +} + + + + + +//--------------------------------------------------------------------- +// +// groupIndexFromName +// +//--------------------------------------------------------------------- +int32_t RegexPattern::groupIndexFromName(const UnicodeString &groupName, + UErrorCode &status) const { + if (U_FAILURE(status)) { + return 0; + } + + int32_t namedGroupCount = fNamedGroups->size(); + int32_t nameIndex; + for (nameIndex = 0; nameIndex < namedGroupCount; ++nameIndex) { + UnicodeString *name = (UnicodeString *)fNamedGroups->elementAt(nameIndex); + + if (*name == groupName) { + return fNamedGroupMap->elementAti(nameIndex) + 1; + } + } + + status = U_REGEX_UNKNOWN_GROUP_NAME; + return -1; +} + + + + +//--------------------------------------------------------------------- +// +// groupCount +// +//--------------------------------------------------------------------- +int32_t RegexPattern::groupCount() const { + return fGroupMap->size(); +} + + + + +//--------------------------------------------------------------------- +// // split // //--------------------------------------------------------------------- @@ -443,6 +554,7 @@ REGEX_DUMP_DEBUG_PRINTF(("%4d %08x %-15s ", index, op, opNames[pinnedType])); switch (type) { case URX_NOP: + case URX_STATE_POP: case URX_DOTANY: case URX_DOTANY_ALL: case URX_DOTANY_PL: @@ -458,6 +570,7 @@ // Types with no operand field of interest. 
break; + case URX_CONDITION: case URX_RESERVED_OP: case URX_START_CAPTURE: case URX_END_CAPTURE: @@ -551,7 +664,6 @@ } #endif - #if defined(REGEX_DEBUG) U_CAPI void U_EXPORT2 RegexPatternDump(const RegexPattern *This) { diff -u -r icu/source/i18n/unicode/regex.h icu.new/source/i18n/unicode/regex.h --- icu/source/i18n/unicode/regex.h 2005-06-23 00:38:06.000000000 +0100 +++ icu.new/source/i18n/unicode/regex.h 2006-08-10 14:46:25.000000000 +0100 @@ -16,7 +16,7 @@ #ifndef REGEX_H #define REGEX_H -//#define REGEX_DEBUG +// #define REGEX_DEBUG /** * \file @@ -329,6 +329,48 @@ /** + * Fills in a list of the named groups present in this regular expression, + * returning the number of groups in the list. Each call will fetch up + * to destCapacity elements starting at startIndex. If the method is + * called with destCapacity equal to zero, it returns the total number of + * elements that can be fetched. + * + * @param startIndex The first index to fetch. + * @param dest An array of UnicodeStrings to receive the results. + * @param destCapacity The number of elements in the destination array. + * @param status A reference to a UErrorCode to receive any errors. + * @return The number of elements in the destination array + * that were used to hold names, *or*, if destCapacity + * is zero, the total number of available elements. + * + */ + virtual int32_t getGroupNames(int32_t startIndex, + UnicodeString dest[], + int32_t destCapacity, + UErrorCode &status); + + + /** + * Returns the group index for the named group with the specified name. + * + * @param groupName the capture group name + * @param status A reference to a UErrorCode to receive any errors. + * + * @return the index of the group, or a -ve number to indicate failure. + * + */ + virtual int32_t groupIndexFromName(const UnicodeString &groupName, + UErrorCode &status) const; + + + /** + * Returns the number of capturing groups in this pattern. 
+ * @return the number of capture groups + */ + virtual int32_t groupCount() const; + + + /** * Split a string into fields. Somewhat like split() from Perl. * The pattern matches identify delimiters that separate the input * into fields. The input data between the matches becomes the @@ -405,6 +447,9 @@ UVector32 *fGroupMap; // Map from capture group number to position of // the group's variables in the matcher stack frame. + UVector *fNamedGroups; // A vector of group names. + UVector32 *fNamedGroupMap; // Map from name index to position of the group's + // variables in the matcher stack int32_t fMaxCaptureDigits; @@ -622,6 +667,35 @@ /** + * Returns a string containing the text captured by the given group + * during the previous match operation. + * + * @param groupName the capture group name + * @param status A reference to a UErrorCode to receive any errors. + * Possible errors are U_REGEX_INVALID_STATE if no match + * has been attempted or the last match failed and + * U_REGEX_UNKNOWN_GROUP_NAME for a bad capture group name. + * @return the captured text + */ + virtual UnicodeString group(const UnicodeString &groupName, + UErrorCode &status) const; + + /** + * Returns the group index for the named group with the specified name. + * Actually calls the similarly named method on the RegexPattern object; + * this method is only provided because of the presence of groupCount() + * on the matcher. + * + * @param groupName the capture group name + * @param status A reference to a UErrorCode to receive any errors. + * + * @return the index of the group, or a -ve number to indicate failure. + * + */ + virtual int32_t groupIndexFromName(const UnicodeString &groupName, + UErrorCode &status) const; + + /** * Returns the number of capturing groups in this matcher's pattern. 
      * @return the number of capture groups
      * @stable ICU 2.4
@@ -656,6 +730,22 @@
 
 
     /**
+     * Returns the index in the input string of the start of the text matched by the
+     * specified capture group during the previous match operation. Return -1 if
+     * the capture group exists in the pattern, but was not part of the last match.
+     *
+     * @param groupName the capture group name
+     * @param status A reference to a UErrorCode to receive any errors. Possible
+     *               errors are U_REGEX_INVALID_STATE if no match has been
+     *               attempted or the last match failed, and
+     *               U_REGEX_UNKNOWN_GROUP_NAME for a bad capture group name
+     * @return the start position of substring matched by the specified group.
+     */
+    virtual int32_t start(const UnicodeString &groupName,
+                          UErrorCode &status) const;
+
+
+    /**
      * Returns the index in the input string of the first character following the
      * text matched during the previous match operation.
      * @param status A reference to a UErrorCode to receive any errors. Possible
@@ -684,6 +774,21 @@
 
 
     /**
+     * Returns the index in the input string of the character following the
+     * text matched by the specified capture group during the previous match operation.
+     * @param groupName the capture group name
+     * @param status A reference to a UErrorCode to receive any errors. Possible
+     *               errors are U_REGEX_INVALID_STATE if no match has been
+     *               attempted or the last match failed and
+     *               U_REGEX_UNKNOWN_GROUP_NAME for a bad capture group name
+     * @return the index of the first character following the text
+     *         captured by the specified group during the previous match operation.
+     *         Return -1 if the capture group exists in the pattern but was not part of the match.
+     */
+    virtual int32_t end(const UnicodeString &groupName, UErrorCode &status) const;
+
+
+    /**
      * Resets this matcher. The effect is to remove any memory of previous matches,
      * and to cause subsequent find() operations to begin at the beginning of
      * the input string.
diff -u -r icu/source/i18n/unicode/uregex.h icu.new/source/i18n/unicode/uregex.h --- icu/source/i18n/unicode/uregex.h 2005-06-28 23:23:06.000000000 +0100 +++ icu.new/source/i18n/unicode/uregex.h 2006-08-09 01:24:26.000000000 +0100 @@ -332,7 +332,8 @@ uregex_groupCount(URegularExpression *regexp, UErrorCode *status); -/** Extract the string for the specified matching expression or subexpression. +/** + * Extract the string for the specified matching expression or subexpression. * Group #0 is the complete string of matched text. * Group #1 is the text matched by the first set of capturing parentheses. * @@ -355,6 +356,56 @@ int32_t destCapacity, UErrorCode *status); +/** + * Returns the group index for the named group with the specified name, + * provided that the named group was matched by the regular expression. + * + * @param regexp The compiled regular expression. + * @param name The capture group name. + * @param status A reference to a UErrorCode to receive any errors. + * @return The index of the corresponding numbered group. + * If there is no group with the specified name, + * -1 is returned with U_REGEX_UNKNOWN_GROUP_NAME in + * status. + */ +U_DRAFT int32_t U_EXPORT2 +uregex_groupIndexFromName(URegularExpression *regexp, + const UChar *name, + int32_t nameLen, + UErrorCode *status); + + +/** + * Returns the number of named groups in this regular expression. + * + * @param regexp The compiled regular expression. + * @return A count of the named groups in this regexp. + */ +U_DRAFT int32_t U_EXPORT2 +uregex_namedGroupCount(URegularExpression *regexp, + UErrorCode *status); + + + +/** + * Returns the name of the specified named group. + * + * @param regexp The compiled regular expression. + * @param namedGrpNum The index of the named group, in the range 0 + * to uregex_namedGroupCount(). + * @param dest Buffer to receive the matching string data + * @param destCapacity Capacity of the dest buffer. 
+ * @param status A reference to a UErrorCode to receive any errors.
+ * @return The length of the name, or zero on failure.
+ */
+U_DRAFT int32_t U_EXPORT2
+uregex_namedGroupName(URegularExpression *regexp,
+                      int32_t namedGrpNum,
+                      UChar *dest,
+                      int32_t destCapacity,
+                      UErrorCode *status);
+
+
 
 /**
  * Returns the index in the input string of the start of the text matched by the
diff -u -r icu/source/i18n/uregex.cpp icu.new/source/i18n/uregex.cpp
--- icu/source/i18n/uregex.cpp 2005-06-21 17:54:40.000000000 +0100
+++ icu.new/source/i18n/uregex.cpp 2006-08-09 01:29:20.000000000 +0100
@@ -432,6 +432,96 @@
 
 //------------------------------------------------------------------------------
 //
+// uregex_groupIndexFromName
+//
+//    Look up the group number for a group name (nameLen UChars, or
+//    NUL-terminated when nameLen is -1).  Returns -1 if the arguments are
+//    rejected here, else the result of RegexMatcher::groupIndexFromName().
+//
+//------------------------------------------------------------------------------
+U_CAPI int32_t U_EXPORT2
+uregex_groupIndexFromName(URegularExpression *regexp,
+                          const UChar *name,
+                          int32_t nameLen,
+                          UErrorCode *status) {
+    if (validateRE(regexp, status) == FALSE) {
+        return -1;
+    }
+    if (name == NULL || nameLen < -1) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return -1;
+    }
+
+    // Make a read-only-aliased copy of the name string.
+    UnicodeString nameString(nameLen == -1, name, nameLen);
+
+    return regexp->fMatcher->groupIndexFromName(nameString, *status);
+}
+
+
+//------------------------------------------------------------------------------
+//
+// uregex_namedGroupCount
+//
+//    Returns the number of named groups in the pattern; getGroupNames()
+//    reports the total when it is called with a zero capacity.
+//
+//------------------------------------------------------------------------------
+U_CAPI int32_t U_EXPORT2
+uregex_namedGroupCount(URegularExpression *regexp,
+                       UErrorCode *status)
+{
+    if (validateRE(regexp, status) == FALSE) {
+        return 0;
+    }
+    int32_t result = regexp->fPat->getGroupNames(0, NULL, 0, *status);
+    return result;
+}
+
+
+//------------------------------------------------------------------------------
+//
+// uregex_namedGroupName
+//
+//    Extracts the name of named group number namedGrpNum (zero based) into
+//    dest and returns what UnicodeString::extract() returns.  Returns zero
+//    with an error in *status for bad arguments or an index with no group.
+//
+//------------------------------------------------------------------------------
+U_CAPI int32_t U_EXPORT2
+uregex_namedGroupName(URegularExpression *regexp,
+                      int32_t namedGrpNum,
+                      UChar *dest,
+                      int32_t destCapacity,
+                      UErrorCode *status)
+{
+    if (validateRE(regexp, status) == FALSE) {
+        return 0;
+    }
+    // A negative index is rejected here: getGroupNames() only checks its upper bound.
+    if (namedGrpNum < 0 || destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+
+    UnicodeString name;
+    int32_t result = regexp->fPat->getGroupNames(namedGrpNum, &name, 1, *status);
+
+    if (U_FAILURE(*status)) {
+        return 0;
+    }
+    if (result == 0) {
+        // Nothing was fetched, so namedGrpNum is past the last named group.
+        *status = U_INDEX_OUTOFBOUNDS_ERROR;
+        return 0;
+    }
+
+    return name.extract(dest, destCapacity, *status);
+}
+
+
+//------------------------------------------------------------------------------
+//
 // uregex_start
 //
 //------------------------------------------------------------------------------
diff -u -r icu/source/test/cintltst/reapits.c icu.new/source/test/cintltst/reapits.c
--- icu/source/test/cintltst/reapits.c 2004-12-16 02:54:24.000000000 +0000
+++ icu.new/source/test/cintltst/reapits.c 2006-08-09 01:33:45.000000000 +0100
@@ -531,6 +531,69 @@
     }
 
     /*
+     * groupIndexFromName()
+     */
+    {
+        UChar text1[80];
+        UChar buf[80];
+        UBool result;
+        int32_t groupIndex;
+        int32_t count;
+
+        u_uastrncpy(text1, "noise abc interior def, and this is off the end",
sizeof(text1)/2); + + status = U_ZERO_ERROR; + re = uregex_openC("(?:abc(?P.*?)def|(?Pergodic))", + 0, NULL, &status); + TEST_ASSERT_SUCCESS(status); + + uregex_setText(re, text1, -1, &status); + result = uregex_find(re, 0, &status); + TEST_ASSERT(result == TRUE); + + status = U_ZERO_ERROR; + count = uregex_namedGroupCount(re, &status); + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT(count == 2); + + /* Check that extracting group names works */ + status = U_ZERO_ERROR; + count = uregex_namedGroupName(re, 0, buf, sizeof(buf) / 2, &status); + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT_STRING("foo", buf, TRUE); + TEST_ASSERT(count == 3); + + status = U_ZERO_ERROR; + count = uregex_namedGroupName(re, 1, buf, sizeof(buf) / 2, &status); + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT_STRING("bar", buf, TRUE); + TEST_ASSERT(count == 3); + + /* Capture group "foo". Should succeed and return 1. */ + status = U_ZERO_ERROR; + u_uastrncpy(buf, "foo", sizeof(buf)/2); + groupIndex = uregex_groupIndexFromName(re, buf, -1, &status); + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT(groupIndex == 1); + + /* Capture group "bar". Should succeed and return 2. */ + status = U_ZERO_ERROR; + u_uastrncpy(buf, "bar", sizeof(buf)/2); + groupIndex = uregex_groupIndexFromName(re, buf, -1, &status); + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT(groupIndex == 2); + + /* Capture group "pants". Should fail. 
*/ + status = U_ZERO_ERROR; + u_uastrncpy(buf, "pants", sizeof(buf)/2); + groupIndex = uregex_groupIndexFromName(re, buf, -1, &status); + TEST_ASSERT(status == U_REGEX_UNKNOWN_GROUP_NAME); + TEST_ASSERT(groupIndex == -1); + + uregex_close(re); + } + + /* * replaceFirst() */ { diff -u -r icu/source/test/intltest/regextst.cpp icu.new/source/test/intltest/regextst.cpp --- icu/source/test/intltest/regextst.cpp 2005-07-05 19:39:00.000000000 +0100 +++ icu.new/source/test/intltest/regextst.cpp 2006-08-10 14:30:06.000000000 +0100 @@ -67,7 +67,6 @@ if (exec) PerlTests(); break; - default: name = ""; break; //needed to end loop } @@ -775,6 +774,394 @@ } + // Conditional matching (there are lots of tests of this in the Perl test suite; we only test + // the named group support here because the Perl suite tests the rest) + { + int32_t flags = 0; + UParseError pe; + UErrorCode status = U_ZERO_ERROR; + + // Contrast this with (?(1)a|b), which works for compatibility with Perl + UnicodeString re("(?(name)a|b)"); + REGEX_ASSERT_FAIL (RegexPattern::compile(re, flags, pe, status), + U_REGEX_UNKNOWN_GROUP_NAME); + + // This regexp (based on one from the Perl tests) matches as follows: + // + // a a a a a a a a a a ... 
+ // | | | | | | | | | | + // | | | | | | +-----+ + // | | | +---+ | + // | +-+ | +--- Fourth iteration + // | | +----------- Third iteration + // | +----------------- Second iteration + // +--------------------- First iteration + // + re = "^(?Pa(?(name)(?P=name))){4}$"; + RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); + REGEX_CHECK_STATUS; + + UnicodeString data = "aaaaaaaaaa"; + RegexMatcher *matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == TRUE); + + delete matcher; + data = "aaaaaaaaa"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == FALSE); + + delete matcher; + data = "aaaaaaaaaaa"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == FALSE); + + delete pat; + + // Test a conditional without the false-part + re = "^(?Pfoo)?(?(name)bar)bly$"; + pat = RegexPattern::compile(re, flags, pe, status); + REGEX_CHECK_STATUS; + + data = "foobarbly"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == TRUE); + delete matcher; + + data = "foobly"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == FALSE); + delete matcher; + + data = "barbly"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == FALSE); + delete matcher; + + data = "bly"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == TRUE); + delete matcher; + + delete pat; + + // Test a repeated conditional + re = "^(?Pfoo)?(?(name)bar){4}bly$"; + pat = RegexPattern::compile(re, flags, pe, status); + REGEX_CHECK_STATUS; + + data = "foobarbarbarbarbly"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == TRUE); + delete matcher; + + data = "foobarbarbly"; + 
matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == FALSE); + delete matcher; + + data = "foobly"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == FALSE); + delete matcher; + + data = "barbarbarbarbly"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == FALSE); + delete matcher; + + data = "bly"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == TRUE); + delete matcher; + + delete pat; + + // Test a repeated conditional with a false-part + re = "^(?Pfoo)?(?(name)bar|ber){4}bly$"; + pat = RegexPattern::compile(re, flags, pe, status); + REGEX_CHECK_STATUS; + + data = "foobarbarbarbarbly"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == TRUE); + delete matcher; + + data = "foobarbarbly"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == FALSE); + delete matcher; + + data = "foobly"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == FALSE); + delete matcher; + + data = "barbarbarbarbly"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == FALSE); + delete matcher; + + data = "berberberberbly"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == TRUE); + delete matcher; + + data = "bly"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == FALSE); + delete matcher; + + delete pat; + + // Test a conditional with a lookahead expression + re = "^(?(?=foo)foobar|f..baz)$"; + pat = RegexPattern::compile(re, flags, pe, status); + REGEX_CHECK_STATUS; + + data = "foobar"; + matcher = pat->matcher(data, status); + 
REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == TRUE); + + delete matcher; + data = "foobaz"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == FALSE); + + delete matcher; + data = "foubaz"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == TRUE); + + delete matcher; + delete pat; + + // Test a conditional with a negated lookahead expression + re = "^(?(?!foo)f..baz|foobar)$"; + pat = RegexPattern::compile(re, flags, pe, status); + REGEX_CHECK_STATUS; + + data = "foobar"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == TRUE); + + delete matcher; + data = "foobaz"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == FALSE); + + delete matcher; + data = "foubaz"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == TRUE); + + delete matcher; + delete pat; + + // Test a conditional with a lookbehind expression + re = "^f..(?(?<=foo)bar|b..)$"; + pat = RegexPattern::compile(re, flags, pe, status); + REGEX_CHECK_STATUS; + + data = "foobar"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == TRUE); + + delete matcher; + data = "foobaz"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == FALSE); + + delete matcher; + data = "foubaz"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == TRUE); + + delete matcher; + delete pat; + + // Test a conditional with a negative lookbehind expression + re = "^f..(?(?matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == TRUE); + + delete matcher; + data = "foobaz"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + 
REGEX_ASSERT(matcher->lookingAt(status) == FALSE); + + delete matcher; + data = "foubaz"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == TRUE); + + delete matcher; + delete pat; + } + + // Named capture group + { + int32_t flags = 0; + UParseError pe; + UErrorCode status = U_ZERO_ERROR; + + UnicodeString re("01(?P23(?P45)67)(?P=bar)" + "(?Pkerpow)?"); + RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); + REGEX_CHECK_STATUS; + UnicodeString data = "0123456745"; + UnicodeString groupNames[2]; + + REGEX_ASSERT(pat->groupIndexFromName("foo", status) == 1); + REGEX_ASSERT(pat->groupIndexFromName("bar", status) == 2); + REGEX_ASSERT(pat->groupCount() == 3); + REGEX_ASSERT(pat->getGroupNames(0, groupNames, 0, status) == 3); + REGEX_ASSERT(pat->getGroupNames(0, groupNames, 2, status) == 2); + REGEX_ASSERT(groupNames[0] == "foo"); + REGEX_ASSERT(groupNames[1] == "bar"); + REGEX_ASSERT(pat->getGroupNames(1, groupNames, 1, status) == 1); + REGEX_ASSERT(groupNames[0] == "bar"); + REGEX_ASSERT(pat->getGroupNames(2, groupNames, 2, status) == 1); + REGEX_ASSERT(groupNames[0] == "blat"); + REGEX_CHECK_STATUS; + + RegexMatcher *matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == TRUE); + + REGEX_ASSERT(matcher->groupIndexFromName("foo", status) == 1); + REGEX_ASSERT(matcher->groupIndexFromName("bar", status) == 2); + REGEX_ASSERT(matcher->groupCount() == 3); + REGEX_ASSERT(matcher->group(0, status) == "0123456745"); + REGEX_ASSERT(matcher->group(1, status) == "234567"); + REGEX_ASSERT(matcher->group(2, status) == "45"); + REGEX_ASSERT(matcher->group("foo", status) == "234567"); + REGEX_ASSERT(matcher->group("bar", status) == "45"); + REGEX_ASSERT(matcher->group("blat", status) == ""); + REGEX_ASSERT(matcher->start("foo", status) == 2); + REGEX_ASSERT(matcher->end("foo", status) == 8); + REGEX_ASSERT(matcher->start("bar", status) == 4); + 
REGEX_ASSERT(matcher->end("bar", status) == 6); + REGEX_CHECK_STATUS; + REGEX_ASSERT_FAIL(matcher->group("pants", status), + U_REGEX_UNKNOWN_GROUP_NAME); + + // Test some bad patterns + UnicodeString re2("The (?Pquick) (?Pbrown) fox"); + REGEX_ASSERT_FAIL(RegexPattern::compile(re2, flags, pe, status), + U_REGEX_DUPLICATE_GROUP_NAME); + + UnicodeString re3("The (?P<>bad name)"); + REGEX_ASSERT_FAIL(RegexPattern::compile(re3, flags, pe, status), + U_REGEX_BAD_GROUP_NAME); + + UnicodeString re4("The (?P=)"); + REGEX_ASSERT_FAIL(RegexPattern::compile(re4, flags, pe, status), + U_REGEX_BAD_GROUP_NAME); + + UnicodeString re5("The (?Pgroup) (?P=unknown)"); + REGEX_ASSERT_FAIL(RegexPattern::compile(re5, flags, pe, status), + U_REGEX_UNKNOWN_GROUP_NAME); + + delete matcher; + delete pat; + + // Check that backrefs work in case-sensitive and case-insensitive mode + UnicodeString re6("This is (?Pcase) (?:in)?sensitive: (?P=name)"); + + pat = RegexPattern::compile(re6, flags | UREGEX_CASE_INSENSITIVE, pe, status); + REGEX_CHECK_STATUS; + data = "This is case insensitive: CASE"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT (matcher->lookingAt(status) == TRUE); + REGEX_CHECK_STATUS; + delete matcher; + data = "This is case insensitive: case"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT (matcher->lookingAt(status) == TRUE); + REGEX_CHECK_STATUS; + delete matcher; + delete pat; + + pat = RegexPattern::compile(re6, flags & ~UREGEX_CASE_INSENSITIVE, pe, status); + REGEX_CHECK_STATUS; + data = "This is case sensitive: CASE"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT (matcher->lookingAt(status) == FALSE); + REGEX_CHECK_STATUS; + delete matcher; + data = "This is case sensitive: case"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT (matcher->lookingAt(status) == TRUE); + REGEX_CHECK_STATUS; + delete matcher; + delete pat; + + // Forward references should 
work too; Python doesn't support this, + // but we do for consistency with numbered backrefs. Note that forward + // references still work backwards in time, so + // + // (?P=foo)(?PABC) + // + // will not match anything (because group "foo" contains nothing). + UnicodeString re7("^(?:ABC(?P=foo)|(?P[a-z]{3}))+$"); + pat = RegexPattern::compile(re7, flags, pe, status); + REGEX_CHECK_STATUS; + data = "defABCdefABCdefhijABChij"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + + REGEX_ASSERT(matcher->lookingAt(status) == TRUE); + REGEX_CHECK_STATUS; + + delete matcher; + data = "defABCdefABCdefhijABCdef"; + matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == FALSE); + REGEX_CHECK_STATUS; + + delete matcher; + delete pat; + } + // // find //