diff -r -u icu/source/common/unicode/utypes.h icu.new/source/common/unicode/utypes.h --- icu/source/common/unicode/utypes.h 2005-06-28 23:18:08.000000000 +0100 +++ icu.new/source/common/unicode/utypes.h 2006-08-06 14:40:17.000000000 +0100 @@ -706,6 +706,8 @@ U_REGEX_INVALID_FLAG, /**< Invalid value for match mode flags. */ U_REGEX_LOOK_BEHIND_LIMIT, /**< Look-Behind pattern matches must have a bounded maximum length. */ U_REGEX_SET_CONTAINS_STRING, /**< Regexps cannot have UnicodeSets containing strings.*/ + U_REGEX_UNKNOWN_GROUP_NAME, /**< Unrecognized group name. */ + U_REGEX_UNTERMINATED_GROUP_NAME, /**< Unterminated group name. */ U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */ /* diff -r -u icu/source/common/utypes.c icu.new/source/common/utypes.c --- icu/source/common/utypes.c 2005-07-12 21:32:00.000000000 +0100 +++ icu.new/source/common/utypes.c 2006-08-05 23:27:54.000000000 +0100 @@ -154,7 +154,9 @@ "U_REGEX_INVALID_BACK_REF", "U_REGEX_INVALID_FLAG", "U_REGEX_LOOK_BEHIND_LIMIT", - "U_REGEX_SET_CONTAINS_STRING" + "U_REGEX_SET_CONTAINS_STRING", + "U_REGEX_UNKNOWN_GROUP_NAME", + "U_REGEX_UNTERMINATED_GROUP_NAME" }; /* TODO: replace the definition with _uIDNAErrorName[U_IDNA_ERROR_LIMIT - U_IDNA_ERROR_START ] * in ICU 3.6 diff -r -u icu/source/i18n/regexcmp.cpp icu.new/source/i18n/regexcmp.cpp --- icu/source/i18n/regexcmp.cpp 2004-12-30 07:25:50.000000000 +0000 +++ icu.new/source/i18n/regexcmp.cpp 2006-08-06 14:41:08.000000000 +0100 @@ -1,4 +1,3 @@ - // // file: regexcmp.cpp // @@ -24,6 +23,7 @@ #include "util.h" #include "cmemory.h" #include "cstring.h" +#include "uvector.h" #include "uvectr32.h" #include "uassert.h" #include "ucln_in.h" @@ -48,7 +48,7 @@ // Constructor. 
// //------------------------------------------------------------------------------ -RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack(status) +RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack(status), fParenNameStack(status) { fStatus = &status; @@ -402,7 +402,22 @@ } break; + case doOpenPyNamedParen: + // Python-style named paren. + // Scan the name and push it onto the name stack, then proceed as for + // a normal capturing paren. + { + UnicodeString *name = scanName (); + + if (!name) { + break; + } + fParenNameStack.push(name, *fStatus); + } + + // No break; fall through here + case doOpenCaptureParen: // Open Paren. // Compile to a @@ -434,7 +449,11 @@ // NOPs may be changed to SAVE_STATE or JMP ops, with a target // address of the end of the parenthesized group. fParenStack.push(fModeFlags, *fStatus); // Match mode state - fParenStack.push(capturing, *fStatus); // Frame type. + if ((Regex_PatternParseAction)action == doOpenCaptureParen) { + fParenStack.push(capturing, *fStatus); // Frame type. + } else if ((Regex_PatternParseAction)action == doOpenPyNamedParen) { + fParenStack.push(named, *fStatus); + } fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc @@ -1199,6 +1218,35 @@ } break; + case doPyNamedBackRef: + // Named backreference. + { + // Find the name in the name table. We must have seen the name + // already at this point, otherwise using it is invalid. 
+ UnicodeString *name = scanName(); + + if (!name) { + break; + } + + int32_t nameIndex; + int32_t nameCount = fRXPat->fNamedGroups->size (); + for (nameIndex = 0; nameIndex < nameCount; ++nameIndex) { + UnicodeString *otherName = (UnicodeString *)fRXPat->fNamedGroups->elementAt(nameIndex); + + if (*otherName == *name) + break; + } + delete name; // Only needed for the lookup above; free it on every path. + if (nameIndex >= nameCount) { + error(U_REGEX_UNKNOWN_GROUP_NAME); + break; + } + + int32_t op = URX_BUILD(URX_NAMEDBACKREF, nameIndex); + fRXPat->fCompiledPat->addElement(op, *fStatus); + } + break; case doPossessivePlus: // Possessive ++ quantifier. @@ -1750,6 +1798,38 @@ fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus); } break; + case named: + // Named Parentheses. + // As for capturing, but with an additional name specification. + { + int32_t captureOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1); + U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE); + + // Get the name and find it; re-use it if it's already in the + // list of named groups for this pattern + UnicodeString *name = (UnicodeString *)fParenNameStack.pop(); + int32_t nameIndex; + int32_t nameCount = fRXPat->fNamedGroups->size (); + for (nameIndex = 0; nameIndex < nameCount; ++nameIndex) { + UnicodeString *otherName = (UnicodeString *)fRXPat->fNamedGroups->elementAt(nameIndex); + + if (*otherName == *name) + break; + } + + if (nameIndex == nameCount) + fRXPat->fNamedGroups->addElement(name, *fStatus); // New name: kept by the pattern's name list + else + delete name; // Duplicate name: the existing entry is re-used, so free this copy + int32_t endNameCaptureOp = URX_BUILD(URX_NAME_CAPTURE, + nameIndex); + fRXPat->fCompiledPat->addElement(endNameCaptureOp, *fStatus); + + int32_t frameVarLocation = URX_VAL(captureOp); + int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation); + fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus); + } + break; case atomic: // Atomic Parenthesis. 
// Insert a LD_SP operation to restore the state stack to the position @@ -2109,6 +2189,7 @@ case URX_STRING_LEN: case URX_NOP: case URX_START_CAPTURE: + case URX_NAME_CAPTURE: case URX_END_CAPTURE: case URX_BACKSLASH_B: case URX_BACKSLASH_BU: @@ -2121,6 +2202,7 @@ case URX_BACKTRACK: case URX_BACKREF: // BackRef. Must assume that it might be a zero length match case URX_BACKREF_I: + case URX_NAMEDBACKREF: case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. case URX_LD_SP: @@ -2580,6 +2662,7 @@ case URX_STRING_LEN: case URX_NOP: case URX_START_CAPTURE: + case URX_NAME_CAPTURE: case URX_END_CAPTURE: case URX_BACKSLASH_B: case URX_BACKSLASH_BU: @@ -2594,6 +2677,7 @@ case URX_BACKTRACK: case URX_BACKREF: // BackRef. Must assume that it might be a zero length match case URX_BACKREF_I: + case URX_NAMEDBACKREF: case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. case URX_LD_SP: @@ -2820,6 +2904,7 @@ case URX_STRING_LEN: case URX_NOP: case URX_START_CAPTURE: + case URX_NAME_CAPTURE: case URX_END_CAPTURE: case URX_BACKSLASH_B: case URX_BACKSLASH_BU: @@ -2848,6 +2933,7 @@ // Call the max length unbounded, and stop further checking. case URX_BACKREF: // BackRef. Must assume that it might be a zero length match case URX_BACKREF_I: + case URX_NAMEDBACKREF: case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. 
case URX_DOTANY_PL: case URX_DOTANY_ALL_PL: @@ -3069,6 +3155,7 @@ case URX_STRING: case URX_STRING_LEN: case URX_START_CAPTURE: + case URX_NAME_CAPTURE: case URX_END_CAPTURE: case URX_STATIC_SETREF: case URX_STAT_SETREF_N: @@ -3091,6 +3178,7 @@ case URX_STO_SP: case URX_LD_SP: case URX_BACKREF: + case URX_NAMEDBACKREF: case URX_STO_INP_LOC: case URX_LA_START: case URX_LA_END: @@ -3157,6 +3245,7 @@ case URX_END: case URX_NOP: + case URX_NAME_CAPTURE: case URX_END_CAPTURE: case URX_DOLLAR_M: case URX_DOLLAR: @@ -3247,7 +3336,9 @@ static const UChar chUpperN = 0x4E; static const UChar chLowerP = 0x70; static const UChar chUpperP = 0x50; - +static const UChar chLAngle = 0x3c; // '<' +static const UChar chEquals = 0x3d; // '=' +static const UChar chRAngle = 0x3e; // '>' //------------------------------------------------------------------------------ // @@ -3527,6 +3618,62 @@ return uset; } + +//------------------------------------------------------------------------------ +// +// scanName Scan the name for a named group, e.g. (?P<name>foo), or a named +// backreference, e.g. (?P=name). +// +// Case 1: Named group +// The scan position will be at the '<'. On return, the scan +// position should be just after the '>'. +// +// Case 2: Named backreference +// The scan position will be at the '='. On return the scan +// position should be just after the ')'. +// +// Return a UnicodeString containing the name, or NULL if the +// pattern is invalid. The caller owns the returned string. 
+// +//------------------------------------------------------------------------------ +UnicodeString *RegexCompile::scanName(void) +{ + UChar chTerm; + + if (U_FAILURE(*fStatus)) { + return NULL; + } + + U_ASSERT(fC.fChar == chLAngle || fC.fChar == chEquals); + + if (fC.fChar == chLAngle) + chTerm = chRAngle; + else + chTerm = chRParen; + + UnicodeString *name = new UnicodeString(); + if (!name) { + error(U_MEMORY_ALLOCATION_ERROR); + return NULL; + } + do { + nextChar(fC); + if (fC.fChar == -1) { + // Hit the end of the input string without finding the closing + // character + error(U_REGEX_UNTERMINATED_GROUP_NAME); + delete name; // Do not leak the partially scanned name + return NULL; + } + if (fC.fChar != chTerm) + name->append (fC.fChar); + } while (fC.fChar != chTerm); + + nextChar(fC); + + return name; +} + U_NAMESPACE_END #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS diff -r -u icu/source/i18n/regexcmp.h icu.new/source/i18n/regexcmp.h --- icu/source/i18n/regexcmp.h 2003-04-03 00:10:16.000000000 +0100 +++ icu.new/source/i18n/regexcmp.h 2006-08-06 14:41:46.000000000 +0100 @@ -77,7 +77,8 @@ negLookAhead = -5, flags = -6, lookBehind = -7, - lookBehindN = -8 + lookBehindN = -8, + named = -9 }; private: @@ -90,6 +91,7 @@ UChar32 peekCharLL(); UnicodeSet *scanSet(); UnicodeSet *scanProp(); + UnicodeString *scanName(); void handleCloseParen(); int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern // at the top of the just completed block @@ -175,6 +177,7 @@ // the kind of paren that opened the frame. Some // need special handling on close. + UStack fParenNameStack; // Parenthesis name stack. int32_t fMatchOpenParen; // The position in the compiled pattern // of the slot reserved for a state save diff -r -u icu/source/i18n/regexcst.h icu.new/source/i18n/regexcst.h --- icu/source/i18n/regexcst.h 2003-11-08 02:01:42.000000000 +0000 +++ icu.new/source/i18n/regexcst.h 2006-08-06 12:10:11.000000000 +0100 @@ -16,75 +16,77 @@ // // Character classes for regex pattern scanning. 
// - static const uint8_t kRuleSet_digit_char = 128; - static const uint8_t kRuleSet_white_space = 129; + static const uint8_t kRuleSet_white_space = 128; + static const uint8_t kRuleSet_digit_char = 129; static const uint8_t kRuleSet_rule_char = 130; enum Regex_PatternParseAction { - doPossessivePlus, - doCloseParen, - doProperty, - doBeginMatchMode, - doOrOperator, - doOpenCaptureParen, - doBadOpenParenType, - doRuleError, + doLiteralChar, + doBackslashA, + doNOP, + doBackslashG, + doPerlInline, + doPyNamedBackRef, doIntevalLowerDigit, - doBackslashs, - doNGOpt, - doBackslashw, - doMismatchedParenErr, - doOpenLookBehind, - doBackslashz, + doProperty, + doBackslashX, + doOpenAtomicParen, + doOpenPyNamedParen, + doPatFinish, + doNGPlus, + doOpenLookBehindNeg, doIntervalError, - doStar, - doCaret, - doEnterQuoteMode, - doNGStar, - doMatchMode, - doIntervalUpperDigit, - doOpenLookAheadNeg, + doIntervalSame, + doBackRef, doPlus, + doOpenCaptureParen, + doMismatchedParenErr, + doBeginMatchMode, + doEscapeError, doOpenNonCaptureParen, - doBackslashA, - doBackslashB, - doNGPlus, + doDollar, + doIntervalUpperDigit, + doBackslashs, + doOpenLookBehind, doSetMatchMode, - doPatFinish, - doBackslashD, - doPossessiveInterval, - doEscapeError, - doBackslashG, - doSuppressComments, + doOrOperator, + doCaret, doMatchModeParen, + doStar, doOpt, - doInterval, - doLiteralChar, - doIntervalInit, - doOpenAtomicParen, - doBackslashS, - doOpenLookAhead, - doBackRef, - doDollar, - doDotAny, + doMatchMode, + doSuppressComments, + doPossessiveInterval, + doOpenLookAheadNeg, doBackslashW, - doBackslashX, + doCloseParen, + doIntervalInit, doScanUnicodeSet, - doBackslashZ, - doPerlInline, + doNGStar, + doEnterQuoteMode, + doBackslashB, + doBackslashw, doPossessiveOpt, - doNOP, + doRuleError, + doBackslashb, doConditionalExpr, - doExit, + doPossessivePlus, + doBadOpenParenType, doNGInterval, - doPatStart, + doBackslashd, + doBackslashD, + doExit, + doInterval, + doNGOpt, + doBackslashS, + 
doBackslashZ, + doOpenLookAhead, doBadModeFlag, - doBackslashb, + doPatStart, doPossessiveStar, - doBackslashd, - doIntervalSame, - doOpenLookBehindNeg, + doBackslashz, + doDotAny, rbbiLastAction}; //------------------------------------------------------------------------------- @@ -112,15 +114,15 @@ , {doDotAny, 46 /* . */, 14,0, TRUE} // 6 , {doCaret, 94 /* ^ */, 2,0, TRUE} // 7 , {doDollar, 36 /* $ */, 2,0, TRUE} // 8 - , {doNOP, 92 /* \ */, 81,0, TRUE} // 9 + , {doNOP, 92 /* \ */, 84,0, TRUE} // 9 , {doOrOperator, 124 /* | */, 2,0, TRUE} // 10 , {doCloseParen, 41 /* ) */, 255,0, TRUE} // 11 , {doPatFinish, 253, 2,0, FALSE} // 12 - , {doRuleError, 255, 101,0, FALSE} // 13 - , {doNOP, 42 /* * */, 59,0, TRUE} // 14 expr-quant - , {doNOP, 43 /* + */, 62,0, TRUE} // 15 - , {doNOP, 63 /* ? */, 65,0, TRUE} // 16 - , {doIntervalInit, 123 /* { */, 68,0, TRUE} // 17 + , {doRuleError, 255, 104,0, FALSE} // 13 + , {doNOP, 42 /* * */, 62,0, TRUE} // 14 expr-quant + , {doNOP, 43 /* + */, 65,0, TRUE} // 15 + , {doNOP, 63 /* ? */, 68,0, TRUE} // 16 + , {doIntervalInit, 123 /* { */, 71,0, TRUE} // 17 , {doNOP, 40 /* ( */, 23,0, TRUE} // 18 , {doNOP, 255, 20,0, FALSE} // 19 , {doOrOperator, 124 /* | */, 2,0, TRUE} // 20 expr-cont @@ -128,7 +130,7 @@ , {doNOP, 255, 2,0, FALSE} // 22 , {doSuppressComments, 63 /* ? */, 25,0, TRUE} // 23 open-paren-quant , {doNOP, 255, 27,0, FALSE} // 24 - , {doNOP, 35 /* # */, 47, 14, TRUE} // 25 open-paren-quant2 + , {doNOP, 35 /* # */, 50, 14, TRUE} // 25 open-paren-quant2 , {doNOP, 255, 29,0, FALSE} // 26 , {doSuppressComments, 63 /* ? */, 29,0, TRUE} // 27 open-paren , {doOpenCaptureParen, 255, 2, 14, FALSE} // 28 @@ -136,75 +138,78 @@ , {doOpenAtomicParen, 62 /* > */, 2, 14, TRUE} // 30 , {doOpenLookAhead, 61 /* = */, 2, 20, TRUE} // 31 , {doOpenLookAheadNeg, 33 /* ! 
*/, 2, 20, TRUE} // 32 - , {doNOP, 60 /* < */, 44,0, TRUE} // 33 - , {doNOP, 35 /* # */, 47, 2, TRUE} // 34 - , {doBeginMatchMode, 105 /* i */, 50,0, FALSE} // 35 - , {doBeginMatchMode, 109 /* m */, 50,0, FALSE} // 36 - , {doBeginMatchMode, 115 /* s */, 50,0, FALSE} // 37 - , {doBeginMatchMode, 119 /* w */, 50,0, FALSE} // 38 - , {doBeginMatchMode, 120 /* x */, 50,0, FALSE} // 39 - , {doBeginMatchMode, 45 /* - */, 50,0, FALSE} // 40 - , {doConditionalExpr, 40 /* ( */, 101,0, TRUE} // 41 - , {doPerlInline, 123 /* { */, 101,0, TRUE} // 42 - , {doBadOpenParenType, 255, 101,0, FALSE} // 43 - , {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 44 open-paren-lookbehind - , {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE} // 45 - , {doBadOpenParenType, 255, 101,0, FALSE} // 46 - , {doNOP, 41 /* ) */, 255,0, TRUE} // 47 paren-comment - , {doMismatchedParenErr, 253, 101,0, FALSE} // 48 - , {doNOP, 255, 47,0, TRUE} // 49 - , {doMatchMode, 105 /* i */, 50,0, TRUE} // 50 paren-flag - , {doMatchMode, 109 /* m */, 50,0, TRUE} // 51 - , {doMatchMode, 115 /* s */, 50,0, TRUE} // 52 - , {doMatchMode, 119 /* w */, 50,0, TRUE} // 53 - , {doMatchMode, 120 /* x */, 50,0, TRUE} // 54 - , {doMatchMode, 45 /* - */, 50,0, TRUE} // 55 - , {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 56 - , {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 57 - , {doBadModeFlag, 255, 101,0, FALSE} // 58 - , {doNGStar, 63 /* ? */, 20,0, TRUE} // 59 quant-star - , {doPossessiveStar, 43 /* + */, 20,0, TRUE} // 60 - , {doStar, 255, 20,0, FALSE} // 61 - , {doNGPlus, 63 /* ? */, 20,0, TRUE} // 62 quant-plus - , {doPossessivePlus, 43 /* + */, 20,0, TRUE} // 63 - , {doPlus, 255, 20,0, FALSE} // 64 - , {doNGOpt, 63 /* ? 
*/, 20,0, TRUE} // 65 quant-opt - , {doPossessiveOpt, 43 /* + */, 20,0, TRUE} // 66 - , {doOpt, 255, 20,0, FALSE} // 67 - , {doNOP, 129, 68,0, TRUE} // 68 interval-open - , {doNOP, 128, 71,0, FALSE} // 69 - , {doIntervalError, 255, 101,0, FALSE} // 70 - , {doIntevalLowerDigit, 128, 71,0, TRUE} // 71 interval-lower - , {doNOP, 44 /* , */, 75,0, TRUE} // 72 - , {doIntervalSame, 125 /* } */, 78,0, TRUE} // 73 - , {doIntervalError, 255, 101,0, FALSE} // 74 - , {doIntervalUpperDigit, 128, 75,0, TRUE} // 75 interval-upper - , {doNOP, 125 /* } */, 78,0, TRUE} // 76 - , {doIntervalError, 255, 101,0, FALSE} // 77 - , {doNGInterval, 63 /* ? */, 20,0, TRUE} // 78 interval-type - , {doPossessiveInterval, 43 /* + */, 20,0, TRUE} // 79 - , {doInterval, 255, 20,0, FALSE} // 80 - , {doBackslashA, 65 /* A */, 2,0, TRUE} // 81 backslash - , {doBackslashB, 66 /* B */, 2,0, TRUE} // 82 - , {doBackslashb, 98 /* b */, 2,0, TRUE} // 83 - , {doBackslashd, 100 /* d */, 14,0, TRUE} // 84 - , {doBackslashD, 68 /* D */, 14,0, TRUE} // 85 - , {doBackslashG, 71 /* G */, 2,0, TRUE} // 86 - , {doProperty, 78 /* N */, 14,0, FALSE} // 87 - , {doProperty, 112 /* p */, 14,0, FALSE} // 88 - , {doProperty, 80 /* P */, 14,0, FALSE} // 89 - , {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 90 - , {doBackslashS, 83 /* S */, 14,0, TRUE} // 91 - , {doBackslashs, 115 /* s */, 14,0, TRUE} // 92 - , {doBackslashW, 87 /* W */, 14,0, TRUE} // 93 - , {doBackslashw, 119 /* w */, 14,0, TRUE} // 94 - , {doBackslashX, 88 /* X */, 14,0, TRUE} // 95 - , {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 96 - , {doBackslashz, 122 /* z */, 2,0, TRUE} // 97 - , {doBackRef, 128, 14,0, TRUE} // 98 - , {doEscapeError, 253, 101,0, FALSE} // 99 - , {doLiteralChar, 255, 14,0, TRUE} // 100 - , {doExit, 255, 101,0, TRUE} // 101 errorDeath + , {doNOP, 60 /* < */, 45,0, TRUE} // 33 + , {doNOP, 35 /* # */, 50, 2, TRUE} // 34 + , {doBeginMatchMode, 105 /* i */, 53,0, FALSE} // 35 + , {doBeginMatchMode, 109 /* m */, 53,0, FALSE} // 36 + , 
{doBeginMatchMode, 115 /* s */, 53,0, FALSE} // 37 + , {doBeginMatchMode, 119 /* w */, 53,0, FALSE} // 38 + , {doBeginMatchMode, 120 /* x */, 53,0, FALSE} // 39 + , {doBeginMatchMode, 45 /* - */, 53,0, FALSE} // 40 + , {doConditionalExpr, 40 /* ( */, 104,0, TRUE} // 41 + , {doPerlInline, 123 /* { */, 104,0, TRUE} // 42 + , {doNOP, 80 /* P */, 48,0, TRUE} // 43 + , {doBadOpenParenType, 255, 104,0, FALSE} // 44 + , {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 45 open-paren-lookbehind + , {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE} // 46 + , {doBadOpenParenType, 255, 104,0, FALSE} // 47 + , {doOpenPyNamedParen, 60 /* < */, 2, 14, FALSE} // 48 paren-python + , {doPyNamedBackRef, 61 /* = */, 14,0, FALSE} // 49 + , {doNOP, 41 /* ) */, 255,0, TRUE} // 50 paren-comment + , {doMismatchedParenErr, 253, 104,0, FALSE} // 51 + , {doNOP, 255, 50,0, TRUE} // 52 + , {doMatchMode, 105 /* i */, 53,0, TRUE} // 53 paren-flag + , {doMatchMode, 109 /* m */, 53,0, TRUE} // 54 + , {doMatchMode, 115 /* s */, 53,0, TRUE} // 55 + , {doMatchMode, 119 /* w */, 53,0, TRUE} // 56 + , {doMatchMode, 120 /* x */, 53,0, TRUE} // 57 + , {doMatchMode, 45 /* - */, 53,0, TRUE} // 58 + , {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 59 + , {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 60 + , {doBadModeFlag, 255, 104,0, FALSE} // 61 + , {doNGStar, 63 /* ? */, 20,0, TRUE} // 62 quant-star + , {doPossessiveStar, 43 /* + */, 20,0, TRUE} // 63 + , {doStar, 255, 20,0, FALSE} // 64 + , {doNGPlus, 63 /* ? */, 20,0, TRUE} // 65 quant-plus + , {doPossessivePlus, 43 /* + */, 20,0, TRUE} // 66 + , {doPlus, 255, 20,0, FALSE} // 67 + , {doNGOpt, 63 /* ? 
*/, 20,0, TRUE} // 68 quant-opt + , {doPossessiveOpt, 43 /* + */, 20,0, TRUE} // 69 + , {doOpt, 255, 20,0, FALSE} // 70 + , {doNOP, 128, 71,0, TRUE} // 71 interval-open + , {doNOP, 129, 74,0, FALSE} // 72 + , {doIntervalError, 255, 104,0, FALSE} // 73 + , {doIntevalLowerDigit, 129, 74,0, TRUE} // 74 interval-lower + , {doNOP, 44 /* , */, 78,0, TRUE} // 75 + , {doIntervalSame, 125 /* } */, 81,0, TRUE} // 76 + , {doIntervalError, 255, 104,0, FALSE} // 77 + , {doIntervalUpperDigit, 129, 78,0, TRUE} // 78 interval-upper + , {doNOP, 125 /* } */, 81,0, TRUE} // 79 + , {doIntervalError, 255, 104,0, FALSE} // 80 + , {doNGInterval, 63 /* ? */, 20,0, TRUE} // 81 interval-type + , {doPossessiveInterval, 43 /* + */, 20,0, TRUE} // 82 + , {doInterval, 255, 20,0, FALSE} // 83 + , {doBackslashA, 65 /* A */, 2,0, TRUE} // 84 backslash + , {doBackslashB, 66 /* B */, 2,0, TRUE} // 85 + , {doBackslashb, 98 /* b */, 2,0, TRUE} // 86 + , {doBackslashd, 100 /* d */, 14,0, TRUE} // 87 + , {doBackslashD, 68 /* D */, 14,0, TRUE} // 88 + , {doBackslashG, 71 /* G */, 2,0, TRUE} // 89 + , {doProperty, 78 /* N */, 14,0, FALSE} // 90 + , {doProperty, 112 /* p */, 14,0, FALSE} // 91 + , {doProperty, 80 /* P */, 14,0, FALSE} // 92 + , {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 93 + , {doBackslashS, 83 /* S */, 14,0, TRUE} // 94 + , {doBackslashs, 115 /* s */, 14,0, TRUE} // 95 + , {doBackslashW, 87 /* W */, 14,0, TRUE} // 96 + , {doBackslashw, 119 /* w */, 14,0, TRUE} // 97 + , {doBackslashX, 88 /* X */, 14,0, TRUE} // 98 + , {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 99 + , {doBackslashz, 122 /* z */, 2,0, TRUE} // 100 + , {doBackRef, 129, 14,0, TRUE} // 101 + , {doEscapeError, 253, 104,0, FALSE} // 102 + , {doLiteralChar, 255, 14,0, TRUE} // 103 + , {doExit, 255, 104,0, TRUE} // 104 errorDeath }; static const char * const RegexStateNames[] = { 0, "start", @@ -250,9 +255,12 @@ 0, 0, 0, + 0, "open-paren-lookbehind", 0, 0, + "paren-python", + 0, "paren-comment", 0, 0, diff -r -u 
icu/source/i18n/regexcst.txt icu.new/source/i18n/regexcst.txt --- icu/source/i18n/regexcst.txt 2003-11-08 02:01:42.000000000 +0000 +++ icu.new/source/i18n/regexcst.txt 2006-08-06 12:09:38.000000000 +0100 @@ -1,4 +1,3 @@ - #***************************************************************************** # # Copyright (C) 2002-2003, International Business Machines Corporation and others. @@ -74,7 +73,7 @@ '\' n backslash '|' n term doOrOperator ')' n pop doCloseParen - eof term doPatFinish + eof term doPatFinish default errorDeath doRuleError @@ -140,13 +139,21 @@ '-' paren-flag doBeginMatchMode '(' n errorDeath doConditionalExpr '{' n errorDeath doPerlInline + 'P' n paren-python default errorDeath doBadOpenParenType open-paren-lookbehind: '=' n term ^expr-cont doOpenLookBehind # (?<= '!' n term ^expr-cont doOpenLookBehindNeg # (?...), or possibly a Python +# named group reference, e.g. (?P=id). +# +paren-python: + '<' term ^expr-quant doOpenPyNamedParen # (?P + '=' expr-quant doPyNamedBackRef # (?P=name) # # paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')' @@ -154,7 +161,7 @@ # paren-comment: ')' n pop - eof errorDeath doMismatchedParenErr + eof errorDeath doMismatchedParenErr default n paren-comment # @@ -213,7 +220,7 @@ interval-lower: digit_char n interval-lower doIntevalLowerDigit - ',' n interval-upper + ',' n interval-upper '}' n interval-type doIntervalSame # {n} default errorDeath doIntervalError @@ -250,9 +257,9 @@ 'X' n expr-quant doBackslashX 'Z' n term doBackslashZ 'z' n term doBackslashz - digit_char n expr-quant doBackRef # Will scan multiple digits + digit_char n expr-quant doBackRef # Will scan multiple digits eof errorDeath doEscapeError - default n expr-quant doLiteralChar # Escaped literal char. + default n expr-quant doLiteralChar # Escaped literal char. 
# diff -r -u icu/source/i18n/regeximp.h icu.new/source/i18n/regeximp.h --- icu/source/i18n/regeximp.h 2004-12-30 07:25:50.000000000 +0000 +++ icu.new/source/i18n/regeximp.h 2006-08-06 13:31:08.000000000 +0100 @@ -168,9 +168,12 @@ // Operand value: // 0: Normal (. doesn't match new-line) mode. // 1: . matches new-line mode. - URX_BACKSLASH_BU = 53 // \b or \B in UREGEX_UWORD mode, using Unicode style + URX_BACKSLASH_BU = 53, // \b or \B in UREGEX_UWORD mode, using Unicode style // word boundaries. - + URX_NAME_CAPTURE = 54, // When processing the next URX_END_CAPTURE, mark the + // group as matching the specified name. + // Parameter is the group name index. + URX_NAMEDBACKREF = 55, // A back-reference to the named group }; // Keep this list of opcode names in sync with the above enum @@ -229,8 +232,9 @@ "LOOP_SR_I", \ "LOOP_C", \ "LOOP_DOT_I", \ - "BACKSLASH_BU" - + "BACKSLASH_BU", \ + "NAME_CAPTURE", \ + "NAMEDBACKREF" // // Convenience macros for assembling and disassembling a compiled operation. 
diff -r -u icu/source/i18n/rematch.cpp icu.new/source/i18n/rematch.cpp --- icu/source/i18n/rematch.cpp 2005-06-17 01:42:54.000000000 +0100 +++ icu.new/source/i18n/rematch.cpp 2006-08-06 14:44:20.000000000 +0100 @@ -48,10 +48,12 @@ fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR; return; } + fNamedGroups = new UVector32(pat->fNamedGroups->size(), + fDeferredStatus); if (pat->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) { fData = (int32_t *)uprv_malloc(pat->fDataSize * sizeof(int32_t)); } - if (fStack == NULL || fData == NULL) { + if (fStack == NULL || fData == NULL || fNamedGroups == NULL) { fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; } @@ -73,10 +75,12 @@ if (U_FAILURE(status)) { return; } + fNamedGroups = new UVector32(fPatternOwned->fNamedGroups->size(), + fDeferredStatus); if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) { fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t)); } - if (fStack == NULL || fData == NULL) { + if (fStack == NULL || fData == NULL || fNamedGroups == NULL) { status = U_MEMORY_ALLOCATION_ERROR; } reset(input); @@ -96,11 +100,12 @@ if (U_FAILURE(status)) { return; } - + fNamedGroups = new UVector32(fPatternOwned->fNamedGroups->size(), + fDeferredStatus); if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(int32_t))) { fData = (int32_t *)uprv_malloc(fPattern->fDataSize * sizeof(int32_t)); } - if (fStack == NULL || fData == NULL) { + if (fStack == NULL || fData == NULL || fNamedGroups == NULL) { status = U_MEMORY_ALLOCATION_ERROR; } reset(*RegexStaticSets::gStaticSets->fEmptyString); @@ -119,6 +124,8 @@ fPatternOwned = NULL; fPattern = NULL; } + delete fNamedGroups; + fNamedGroups = NULL; #if UCONFIG_NO_BREAK_ITERATION==0 delete fWordBreakItr; #endif @@ -299,6 +306,43 @@ +int32_t RegexMatcher::end(const UnicodeString &groupName, UErrorCode &err) const { + if (U_FAILURE(err)) { + return -1; + } + if (fMatch == FALSE) { + err = U_REGEX_INVALID_STATE; + return -1; + } + + int32_t 
namedGroupCount = fPattern->fNamedGroups->size(); + int32_t nameIndex; + int32_t groupOffset = -1; + for (nameIndex = 0; nameIndex < namedGroupCount; ++nameIndex) { + UnicodeString *name = (UnicodeString *)fPattern->fNamedGroups->elementAt(nameIndex); + + if (*name == groupName) { + groupOffset = fNamedGroups->elementAti(nameIndex); + break; + } + } + + if (nameIndex >= namedGroupCount) { + err = U_REGEX_UNKNOWN_GROUP_NAME; + return -1; + } + + if (groupOffset < 0) { + // The capture group wasn't part of the match + return -1; + } + + U_ASSERT(groupOffset < fPattern->fFrameSize); + + return fFrame->fExtra[groupOffset + 1]; +} + + //-------------------------------------------------------------------------------- // // find() @@ -541,6 +585,102 @@ } +UnicodeString RegexMatcher::group(const UnicodeString &groupName, + UErrorCode &status) const { + // We don't use start() and end() in here, because it would mean scanning + // the name list twice, which is unecessarily inefficient. + if (U_FAILURE(status)) { + return UnicodeString(); + } + if (U_FAILURE(fDeferredStatus)) { + status = fDeferredStatus; + return UnicodeString(); + } + if (fMatch == FALSE) { + status = U_REGEX_INVALID_STATE; + return UnicodeString(); + } + + int32_t namedGroupCount = fPattern->fNamedGroups->size(); + int32_t nameIndex; + int32_t groupOffset = -1; + for (nameIndex = 0; nameIndex < namedGroupCount; ++nameIndex) { + UnicodeString *name = (UnicodeString *)fPattern->fNamedGroups->elementAt(nameIndex); + + if (*name == groupName) { + groupOffset = fNamedGroups->elementAti(nameIndex); + break; + } + } + + if (nameIndex >= namedGroupCount) { + status = U_REGEX_UNKNOWN_GROUP_NAME; + return UnicodeString(); + } + + if (groupOffset < 0) { + // The capture group wasn't part of the match + return UnicodeString(); + } + + U_ASSERT(groupOffset < fPattern->fFrameSize); + U_ASSERT(groupOffset >= 0); + + int32_t s = fFrame->fExtra[groupOffset]; + int32_t e = fFrame->fExtra[groupOffset + 1]; + + U_ASSERT(s <= 
e); + return UnicodeString (*fInput, s, e-s); +} + + +int32_t RegexMatcher::groupIndexFromName(const UnicodeString &groupName, + UErrorCode &status) const { + if (U_FAILURE(status)) { + return -1; + } + if (U_FAILURE(fDeferredStatus)) { + status = fDeferredStatus; + return -1; + } + + // First, find the name in the name list + int32_t namedGroupCount = fPattern->fNamedGroups->size(); + int32_t nameIndex; + int32_t groupOffset = -1; + for (nameIndex = 0; nameIndex < namedGroupCount; ++nameIndex) { + UnicodeString *name = (UnicodeString *)fPattern->fNamedGroups->elementAt(nameIndex); + + if (*name == groupName) { + groupOffset = fNamedGroups->elementAti(nameIndex); + break; + } + } + + if (nameIndex >= namedGroupCount) { + status = U_REGEX_UNKNOWN_GROUP_NAME; + return -1; + } + + // If this group wasn't matched, return 0 + if (groupOffset < 0) + return 0; + + // Next, locate the group index by scanning the group map + int32_t groupCount = fPattern->fGroupMap->size(); + int32_t groupIndex; + for (groupIndex = 0; groupIndex < groupCount; ++groupIndex) { + int32_t groupPtr = fPattern->fGroupMap->elementAti(groupIndex); + + if (groupPtr == groupOffset) { + return groupIndex + 1; + } + } + + status = U_REGEX_UNKNOWN_GROUP_NAME; + return -1; +} + int32_t RegexMatcher::groupCount() const { @@ -858,7 +998,6 @@ - int32_t RegexMatcher::start(int group, UErrorCode &status) const { if (U_FAILURE(status)) { return -1; @@ -889,6 +1028,44 @@ +int32_t RegexMatcher::start(const UnicodeString &groupName, + UErrorCode &err) const { + if (U_FAILURE(err)) { + return -1; + } + if (fMatch == FALSE) { + err = U_REGEX_INVALID_STATE; + return -1; + } + + int32_t namedGroupCount = fPattern->fNamedGroups->size(); + int32_t nameIndex; + int32_t groupOffset = -1; + for (nameIndex = 0; nameIndex < namedGroupCount; ++nameIndex) { + UnicodeString *name = (UnicodeString *)fPattern->fNamedGroups->elementAt(nameIndex); + + if (*name == groupName) { + groupOffset = fNamedGroups->elementAti(nameIndex); + 
break; + } + } + + if (nameIndex >= namedGroupCount) { + err = U_REGEX_UNKNOWN_GROUP_NAME; + return -1; + } + + if (groupOffset < 0) { + // The capture group wasn't part of the match + return -1; + } + + U_ASSERT(groupOffset < fPattern->fFrameSize); + + return fFrame->fExtra[groupOffset]; +} + + //================================================================================ // // Code following this point in this file is the internal @@ -1093,6 +1270,18 @@ fData[i] = 0; } + // Index of the group name set by the last URX_NAME_CAPTURE, or -1 if none is pending + int32_t captureNameIdx = -1; + + // Reset the named group indices + int32_t namedGroupCount = fPattern->fNamedGroups->size(); + if (!fNamedGroups->ensureCapacity(namedGroupCount, status)) + return; + fNamedGroups->setSize(namedGroupCount); + for (i = 0; i < namedGroupCount; ++i) { + fNamedGroups->setElementAt (-1, i); + } + // // Main loop for interpreting the compiled pattern. // One iteration of the loop per pattern operation performed. @@ -1210,12 +1399,22 @@ break; + case URX_NAME_CAPTURE: + U_ASSERT(opValue >= 0 && opValue < namedGroupCount); + captureNameIdx = opValue; + break; + + case URX_END_CAPTURE: U_ASSERT(opValue >= 0 && opValue < frameSize-3); U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. fp->fExtra[opValue+1] = fp->fInputIdx; // End position U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); + if (captureNameIdx >= 0) { + fNamedGroups->setElementAt(opValue, captureNameIdx); + captureNameIdx = -1; + } break; @@ -1794,6 +1993,19 @@ } break; + case URX_NAMEDBACKREF: + { + U_ASSERT(opValue >= 0 && opValue < namedGroupCount); + + opValue = fNamedGroups->elementAti(opValue); + + if (opValue < 0) { + // This group hasn't been matched yet + fp = (REStackFrame *)fStack->popFrame(frameSize); // FAIL, no match. 
+ break; + } + // Fall through + } case URX_BACKREF: case URX_BACKREF_I: { @@ -1816,7 +2028,7 @@ UBool haveMatch = FALSE; if (fp->fInputIdx + len <= inputLen) { - if (opType == URX_BACKREF) { + if (opType == URX_BACKREF || opType == URX_NAMEDBACKREF) { if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, len) == 0) { haveMatch = TRUE; } diff -r -u icu/source/i18n/repattrn.cpp icu.new/source/i18n/repattrn.cpp --- icu/source/i18n/repattrn.cpp 2004-12-30 07:25:50.000000000 +0000 +++ icu.new/source/i18n/repattrn.cpp 2006-08-06 14:45:20.000000000 +0100 @@ -57,6 +57,11 @@ // Assignmenet Operator // //-------------------------------------------------------------------------- +static void copyStringTok(UHashTok *a, UHashTok *b) +{ + a->pointer = new UnicodeString (*(UnicodeString *)b->pointer); +} + RegexPattern &RegexPattern::operator = (const RegexPattern &other) { if (this == &other) { // Source and destination are the same. Don't do anything. @@ -91,6 +96,7 @@ // Copy the pattern. It's just values, nothing deep to copy. fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus); fGroupMap->assign(*other.fGroupMap, fDeferredStatus); + fNamedGroups->assign(*other.fNamedGroups, (UTokenAssigner *)copyStringTok, fDeferredStatus); // Copy the Unicode Sets. 
// Could be made more efficient if the sets were reference counted and shared, @@ -135,6 +141,7 @@ fFrameSize = 0; fDataSize = 0; fGroupMap = NULL; + fNamedGroups = NULL; fMaxCaptureDigits = 1; fStaticSets = NULL; fStaticSets8 = NULL; @@ -147,13 +154,14 @@ fCompiledPat = new UVector32(fDeferredStatus); fGroupMap = new UVector32(fDeferredStatus); + fNamedGroups = new UVector(fDeferredStatus); fSets = new UVector(fDeferredStatus); fInitialChars = new UnicodeSet; fInitialChars8 = new Regex8BitSet; if (U_FAILURE(fDeferredStatus)) { return; } - if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL || + if (fCompiledPat == NULL || fGroupMap == NULL || fNamedGroups == NULL || fSets == NULL || fInitialChars == NULL || fInitialChars8 == NULL) { fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; return; @@ -186,6 +194,15 @@ fSets8 = NULL; delete fGroupMap; fGroupMap = NULL; + for (i = 0; fNamedGroups != NULL && i < fNamedGroups->size(); ++i) { + UnicodeString *s; + s = (UnicodeString *)fNamedGroups->elementAt(i); + if (s != NULL) { + delete s; + } + } + delete fNamedGroups; + fNamedGroups = NULL; delete fInitialChars; fInitialChars = NULL; delete fInitialChars8; diff -r -u icu/source/i18n/unicode/regex.h icu.new/source/i18n/unicode/regex.h --- icu/source/i18n/unicode/regex.h 2005-06-23 00:38:06.000000000 +0100 +++ icu.new/source/i18n/unicode/regex.h 2006-08-06 14:45:56.000000000 +0100 @@ -405,6 +405,7 @@ UVector32 *fGroupMap; // Map from capture group number to position of // the group's variables in the matcher stack frame. + UVector *fNamedGroups; // A vector of group names (owned UnicodeString pointers). int32_t fMaxCaptureDigits; @@ -622,6 +623,32 @@ /** + * Returns a string containing the text captured by the given group + * during the previous match operation. + * + * @param groupName the capture group name + * @param status A reference to a UErrorCode to receive any errors. 
+ * Possible errors are U_REGEX_INVALID_STATE if no match + * has been attempted or the last match failed and + * U_REGEX_UNKNOWN_GROUP_NAME for a bad capture group name. + * @return the captured text + */ + virtual UnicodeString group(const UnicodeString &groupName, + UErrorCode &status) const; + + /** + * Returns the group index for the named group with the specified name. + * + * @param groupName the capture group name + * @param status A reference to a UErrorCode to receive any errors. + * + * @return the index of the group, or a negative number to indicate failure. + * + */ + virtual int32_t groupIndexFromName(const UnicodeString &groupName, + UErrorCode &status) const; + + /** * Returns the number of capturing groups in this matcher's pattern. * @return the number of capture groups * @stable ICU 2.4 @@ -656,6 +683,22 @@ /** + * Returns the index in the input string of the start of the text matched by the + * specified capture group during the previous match operation. Return -1 if + * the capture group exists in the pattern, but was not part of the last match. + * + * @param groupName the capture group name + * @param status A reference to a UErrorCode to receive any errors. Possible + * errors are U_REGEX_INVALID_STATE if no match has been + * attempted or the last match failed, and + * U_REGEX_UNKNOWN_GROUP_NAME for a bad capture group name. + * @return the start position of substring matched by the specified group. + */ + virtual int32_t start(const UnicodeString &groupName, + UErrorCode &status) const; + + + /** * Returns the index in the input string of the first character following the * text matched during the previous match operation. * @param status A reference to a UErrorCode to receive any errors. Possible @@ -684,6 +727,21 @@ /** + * Returns the index in the input string of the character following the + * text matched by the specified capture group during the previous match operation. 
+ * @param groupName the capture group name + * @param status A reference to a UErrorCode to receive any errors. Possible + * errors are U_REGEX_INVALID_STATE if no match has been + * attempted or the last match failed and + * U_REGEX_UNKNOWN_GROUP_NAME for a bad capture group name. + * @return the index of the first character following the text + * captured by the specified group during the previous match operation. + * Return -1 if the capture group exists in the pattern but was not part of the match. + */ + virtual int32_t end(const UnicodeString &groupName, UErrorCode &status) const; + + + /** * Resets this matcher. The effect is to remove any memory of previous matches, * and to cause subsequent find() operations to begin at the beginning of * the input string. @@ -935,6 +993,8 @@ int32_t *fData; // Data area for use by the compiled pattern. int32_t fSmallData[8]; // Use this for data if it's enough. + UVector32 *fNamedGroups; // A vector of indices, one for each named group in the pattern. + UBool fTraceDebug; // Set true for debug tracing of match engine. UErrorCode fDeferredStatus; // Save error state if that cannot be immediately diff -r -u icu/source/i18n/unicode/uregex.h icu.new/source/i18n/unicode/uregex.h --- icu/source/i18n/unicode/uregex.h 2005-06-28 23:23:06.000000000 +0100 +++ icu.new/source/i18n/unicode/uregex.h 2006-08-06 14:46:26.000000000 +0100 @@ -332,7 +332,8 @@ uregex_groupCount(URegularExpression *regexp, UErrorCode *status); -/** Extract the string for the specified matching expression or subexpression. +/** + * Extract the string for the specified matching expression or subexpression. * Group #0 is the complete string of matched text. * Group #1 is the text matched by the first set of capturing parentheses. * @@ -355,6 +356,24 @@ int32_t destCapacity, UErrorCode *status); +/** + * Returns the group index for the named group with the specified name, + * provided that the named group was matched by the regular expression. 
+ * + * @param regexp The compiled regular expression. + * @param name The capture group name; nameLen gives its length, or is -1 if name is NUL-terminated. + * @param status A reference to a UErrorCode to receive any errors. + * @return if the named group was matched, the index of the + * corresponding numbered group, otherwise 0. + * If there is no group with the specified name, + * -1 is returned with U_REGEX_UNKNOWN_GROUP_NAME in + * status. -1 is also returned, with U_ILLEGAL_ARGUMENT_ERROR, when name is NULL or nameLen < -1. + */ +U_DRAFT int32_t U_EXPORT2 +uregex_groupIndexFromName(URegularExpression *regexp, + const UChar *name, + int32_t nameLen, + UErrorCode *status); /** * Returns the index in the input string of the start of the text matched by the diff -r -u icu/source/i18n/uregex.cpp icu.new/source/i18n/uregex.cpp --- icu/source/i18n/uregex.cpp 2005-06-21 17:54:40.000000000 +0100 +++ icu.new/source/i18n/uregex.cpp 2006-08-06 14:47:02.000000000 +0100 @@ -432,6 +432,28 @@ //------------------------------------------------------------------------------ // +// uregex_groupIndexFromName +// +//------------------------------------------------------------------------------ +U_CAPI int32_t U_EXPORT2 +uregex_groupIndexFromName(URegularExpression *regexp, + const UChar *name, + int32_t nameLen, + UErrorCode *status) { + if (name == NULL || nameLen < -1) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return -1; + } + + // Make a read-only-aliased copy of the name string. 
+ UnicodeString nameString(nameLen == -1, name, nameLen); + + return regexp->fMatcher->groupIndexFromName(nameString, *status); +} + + +//------------------------------------------------------------------------------ +// // uregex_start // //------------------------------------------------------------------------------ diff -r -u icu/source/test/cintltst/reapits.c icu.new/source/test/cintltst/reapits.c --- icu/source/test/cintltst/reapits.c 2004-12-16 02:54:24.000000000 +0000 +++ icu.new/source/test/cintltst/reapits.c 2006-08-06 14:29:29.000000000 +0100 @@ -531,6 +531,50 @@ } /* + * groupIndexFromName() + */ + { + UChar text1[80]; + UChar buf[80]; + UBool result; + int32_t groupIndex; + + u_uastrncpy(text1, "noise abc interior def, and this is off the end", sizeof(text1)/2); + + status = U_ZERO_ERROR; + re = uregex_openC("(?:abc(?P<foo>.*?)def|(?P<bar>ergodic))", + 0, NULL, &status); + TEST_ASSERT_SUCCESS(status); + + uregex_setText(re, text1, -1, &status); + result = uregex_find(re, 0, &status); + TEST_ASSERT(result == TRUE); + + /* Capture group "foo". Should succeed. */ + status = U_ZERO_ERROR; + u_uastrncpy(buf, "foo", sizeof(buf)/2); + groupIndex = uregex_groupIndexFromName(re, buf, -1, &status); + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT(groupIndex == 1); + + /* Capture group "bar". Should succeed and return 0. */ + status = U_ZERO_ERROR; + u_uastrncpy(buf, "bar", sizeof(buf)/2); + groupIndex = uregex_groupIndexFromName(re, buf, -1, &status); + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT(groupIndex == 0); + + /* Capture group "pants". Should fail. 
*/ + status = U_ZERO_ERROR; + u_uastrncpy(buf, "pants", sizeof(buf)/2); + groupIndex = uregex_groupIndexFromName(re, buf, -1, &status); + TEST_ASSERT(status == U_REGEX_UNKNOWN_GROUP_NAME); + TEST_ASSERT(groupIndex == -1); + + uregex_close(re); + } + + /* * replaceFirst() */ { diff -r -u icu/source/test/intltest/regextst.cpp icu.new/source/test/intltest/regextst.cpp --- icu/source/test/intltest/regextst.cpp 2005-07-05 19:39:00.000000000 +0100 +++ icu.new/source/test/intltest/regextst.cpp 2006-08-06 14:47:53.000000000 +0100 @@ -67,7 +67,6 @@ if (exec) PerlTests(); break; - default: name = ""; break; //needed to end loop } @@ -775,6 +774,39 @@ } + // Named capture group + { + int32_t flags = 0; + UParseError pe; + UErrorCode status = U_ZERO_ERROR; + + UnicodeString re("01(?P<foo>23(?P<bar>45)67)(?P=bar)" + "(?P<blat>kerpow)?"); + RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); + REGEX_CHECK_STATUS; + UnicodeString data = "0123456745"; + + RegexMatcher *matcher = pat->matcher(data, status); + REGEX_CHECK_STATUS; + REGEX_ASSERT(matcher->lookingAt(status) == TRUE); + + REGEX_ASSERT(matcher->groupIndexFromName("foo", status) == 1); + REGEX_ASSERT(matcher->groupIndexFromName("bar", status) == 2); + REGEX_ASSERT(matcher->group(0, status) == "0123456745"); + REGEX_ASSERT(matcher->group(1, status) == "234567"); + REGEX_ASSERT(matcher->group(2, status) == "45"); + REGEX_ASSERT(matcher->group("foo", status) == "234567"); + REGEX_ASSERT(matcher->group("bar", status) == "45"); + REGEX_ASSERT(matcher->group("blat", status) == -1); // NOTE(review): compares a UnicodeString with -1; was start("blat", status) intended? + REGEX_ASSERT(matcher->start("foo", status) == 2); + REGEX_ASSERT(matcher->end("foo", status) == 8); + REGEX_ASSERT(matcher->start("bar", status) == 4); + REGEX_ASSERT(matcher->end("bar", status) == 6); + REGEX_CHECK_STATUS; + REGEX_ASSERT_FAIL(matcher->group("pants", status), + U_REGEX_UNKNOWN_GROUP_NAME); + } + // // find //