| 1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"> | 
|---|
| 2 | <html> | 
|---|
| 3 |    <head> | 
|---|
| 4 |       <title>Boost.Regex: Working With Unicode and ICU String Types</title> | 
|---|
| 5 |       <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"> | 
|---|
| 6 |       <LINK href="../../../boost.css" type="text/css" rel="stylesheet"></head> | 
|---|
| 7 |    <body> | 
|---|
| 8 |       <P> | 
|---|
| 9 |          <TABLE id="Table1" cellSpacing="1" cellPadding="1" width="100%" border="0"> | 
|---|
| 10 |             <TR> | 
|---|
| 11 |                <td vAlign="top" width="300"> | 
|---|
| 12 |                   <h3><A href="../../../index.htm"><IMG height="86" alt="C++ Boost" src="../../../boost.png" width="277" border="0"></A></h3> | 
|---|
| 13 |                </td> | 
|---|
| 14 |                <TD width="353"> | 
|---|
| 15 |                   <H1 align="center">Boost.Regex</H1> | 
|---|
| 16 |                   <H2 align="center">Working With Unicode and ICU String Types.</H2> | 
|---|
| 17 |                </TD> | 
|---|
| 18 |                <td width="50"> | 
|---|
| 19 |                   <h3><A href="index.html"><IMG height="45" alt="Boost.Regex Index" src="uarrow.gif" width="43" border="0"></A></h3> | 
|---|
| 20 |                </td> | 
|---|
| 21 |             </TR> | 
|---|
| 22 |          </TABLE> | 
|---|
| 23 |       </P> | 
|---|
| 24 |       <HR> | 
|---|
| 25 |       <p></p> | 
|---|
| 26 |       <H3>Contents</H3> | 
|---|
| 27 |       <dl class="index"> | 
|---|
| 28 |          <dt><a href="#introduction">Introduction</a></dt>  | 
|---|
| 29 |          <dt><a href="#types">Unicode regular expression types</a></dt>  | 
|---|
| 30 |          <dt><a href="#algo">Regular Expression Algorithms</a> | 
|---|
| 31 |             <dd> | 
|---|
| 32 |                <dl class="index"> | 
|---|
| 33 |                   <dt><a href="#u32regex_match">u32regex_match</a></dt>  | 
|---|
| 34 |                   <dt><a href="#u32regex_search">u32regex_search</a></dt>  | 
|---|
| 35 |                   <dt><a href="#u32regex_replace">u32regex_replace</a></dt>  | 
|---|
| 36 |                </dl> | 
|---|
| 37 |             </dd> | 
|---|
| 38 |          </dt> | 
|---|
| 39 |          <dt><a href="#iterators">Iterators</a> | 
|---|
| 40 |             <dd> | 
|---|
| 41 |                <dl class="index"> | 
|---|
| 42 |                   <dt><a href="#u32regex_iterator">u32regex_iterator</a></dt>  | 
|---|
| 43 |                   <dt><a href="#u32regex_token_iterator">u32regex_token_iterator</a></dt>  | 
|---|
| 44 |                </dl> | 
|---|
| 45 |             </dd> | 
|---|
| 46 |          </dt> | 
|---|
| 47 |       </dl> | 
|---|
| 48 |       <H3><A name="introduction"></A>Introduction</H3> | 
|---|
| 49 |       <P>The header:</P> | 
|---|
| 50 |       <PRE>&lt;boost/regex/icu.hpp&gt;</PRE> | 
|---|
| 51 |       <P>contains the data types and algorithms necessary for working with regular  | 
|---|
| 52 |          expressions in a Unicode aware environment.  | 
|---|
| 53 |       </P> | 
|---|
| 54 |       <P>In order to use this header you will need <A href="http://www.ibm.com/software/globalization/icu/"> | 
|---|
| 55 |             the ICU library</A>, and you will need to have built the Boost.Regex library  | 
|---|
| 56 |          with <A href="install.html#unicode">ICU support enabled</A>.</P> | 
|---|
| 57 |       <P>The header will enable you to:</P> | 
|---|
| 58 |       <UL> | 
|---|
| 59 |          <LI> | 
|---|
| 60 |          Create regular expressions that treat Unicode strings as sequences of UTF-32  | 
|---|
| 61 |          code points. | 
|---|
| 62 |          <LI> | 
|---|
| 63 |          Create regular expressions that support various Unicode data properties,  | 
|---|
| 64 |          including character classification. | 
|---|
| 65 |          <LI> | 
|---|
| 66 |             Transparently search Unicode strings that are encoded as either UTF-8, UTF-16  | 
|---|
| 67 |             or UTF-32.</LI></UL> | 
|---|
| 68 |       <H3><A name="types"></A>Unicode regular expression types</H3> | 
|---|
| 69 |       <P>Header &lt;boost/regex/icu.hpp&gt; provides a regular expression traits  | 
|---|
| 70 |          class that handles UTF-32 characters:</P> | 
|---|
| 71 |       <PRE>class icu_regex_traits;</PRE> | 
|---|
| 72 |       <P>and a regular expression type based upon that:</P> | 
|---|
| 73 |       <PRE>typedef basic_regex&lt;UChar32,icu_regex_traits&gt; u32regex;</PRE> | 
|---|
| 74 |       <P>The type <EM>u32regex</EM> is the regular expression type to use for all Unicode  | 
|---|
| 75 |          regular expressions; internally it uses UTF-32 code points, but can be created  | 
|---|
| 76 |          from, and used to search, either UTF-8, or UTF-16 encoded strings as well as  | 
|---|
| 77 |          UTF-32 ones.</P> | 
|---|
| 78 |       <P>The <A href="basic_regex.html#c2">constructors</A>, and <A href="basic_regex.html#a1"> | 
|---|
| 79 |             assign</A> member functions of u32regex, require UTF-32 encoded strings, but  | 
|---|
| 80 |          there are a series of overloaded algorithms called make_u32regex which allow  | 
|---|
| 81 |          regular expressions to be created from UTF-8, UTF-16, or UTF-32 encoded  | 
|---|
| 82 |          strings:</P> | 
|---|
| 83 |       <PRE>template <class InputIterator>  | 
|---|
| 84 | u32regex make_u32regex(InputIterator i, InputIterator j, boost::regex_constants::syntax_option_type opt); | 
|---|
| 85 | </PRE> | 
|---|
| 86 |       <P><STRONG>Effects:</STRONG> Creates a regular expression object from the iterator  | 
|---|
| 87 |          sequence [i,j). The character encoding of the sequence is determined based upon <code> | 
|---|
| 88 |             sizeof(*i)</code>: 1 implies UTF-8, 2 implies UTF-16, and 4 implies UTF-32.</P> | 
|---|
| 89 |       <PRE>u32regex make_u32regex(const char* p, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl); | 
|---|
| 90 | </PRE> | 
|---|
| 91 |       <P><STRONG>Effects:</STRONG> Creates a regular expression object from the  | 
|---|
| 92 |          Null-terminated UTF-8 character sequence <EM>p</EM>.</P> | 
|---|
| 93 |       <PRE>u32regex make_u32regex(const unsigned char* p, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl);</PRE> | 
|---|
| 94 |       <P><STRONG>Effects:</STRONG> Creates a regular expression object from the  | 
|---|
| 95 |          Null-terminated UTF-8 character sequence <EM>p</EM>.</P> | 
|---|
| 96 |       <PRE>u32regex make_u32regex(const wchar_t* p, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl); | 
|---|
| 97 | </PRE> | 
|---|
| 98 |       <P><STRONG>Effects:</STRONG> Creates a regular expression object from the  | 
|---|
| 99 |          Null-terminated character sequence <EM>p</EM>.  The character encoding of  | 
|---|
| 100 |          the sequence is determined based upon <CODE>sizeof(wchar_t)</CODE>: 1 implies  | 
|---|
| 101 |          UTF-8, 2 implies UTF-16, and 4 implies UTF-32.</P> | 
|---|
| 102 |       <PRE>u32regex make_u32regex(const UChar* p, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl);</PRE> | 
|---|
| 103 |       <P><STRONG>Effects:</STRONG> Creates a regular expression object from the  | 
|---|
| 104 |          Null-terminated UTF-16 character sequence <EM>p</EM>.</P> | 
|---|
| 105 |       <PRE>template<class C, class T, class A> | 
|---|
| 106 | u32regex make_u32regex(const std::basic_string<C, T, A>& s, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl);</PRE> | 
|---|
| 107 |       <P><STRONG>Effects:</STRONG> Creates a regular expression object from the string <EM>s</EM>.   | 
|---|
| 108 |          The character encoding of the string is determined based upon <CODE>sizeof(C)</CODE>:  | 
|---|
| 109 |          1 implies UTF-8, 2 implies UTF-16, and 4 implies UTF-32.</P> | 
|---|
| 110 |       <PRE>u32regex make_u32regex(const UnicodeString& s, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl);</PRE> | 
|---|
| 111 |       <P><STRONG>Effects:</STRONG> Creates a regular expression object from the UTF-16  | 
|---|
| 112 |          encoded string <EM>s</EM>.</P> | 
|---|
| 113 |       <H3><A name="algo"></A>Regular Expression Algorithms</H3> | 
|---|
| 114 |       <P>The regular expression algorithms <A href="regex_match.html">regex_match</A>, <A href="regex_search.html"> | 
|---|
| 115 |             regex_search</A> and <A href="regex_replace.html">regex_replace</A> all  | 
|---|
| 116 |          expect that the character sequence upon which they operate, is encoded in the  | 
|---|
| 117 |          same character encoding as the regular expression object with which they are  | 
|---|
| 118 |          used.  For Unicode regular expressions that behavior is undesirable: while  | 
|---|
| 119 |          we may want to process the data in UTF-32 "chunks", the actual data is much  | 
|---|
| 120 |          more likely to be encoded as either UTF-8 or UTF-16.  Therefore the header  | 
|---|
| 121 |          &lt;boost/regex/icu.hpp&gt; provides a series of thin wrappers around these  | 
|---|
| 122 |          algorithms, called u32regex_match, u32regex_search, and u32regex_replace.   | 
|---|
| 123 |          These wrappers use iterator-adapters internally to make external UTF-8 or  | 
|---|
| 124 |          UTF-16 data look as though it's really a UTF-32 sequence, that can then be  | 
|---|
| 125 |          passed on to the "real" algorithm.</P> | 
|---|
| 126 |       <H4><A name="u32regex_match"></A>u32regex_match</H4> | 
|---|
| 127 |       <P>For each <A href="regex_match.html">regex_match</A> algorithm defined by  | 
|---|
| 128 |          &lt;boost/regex.hpp&gt;, &lt;boost/regex/icu.hpp&gt; defines an overloaded  | 
|---|
| 129 |          algorithm that takes the same arguments, but which is called <EM>u32regex_match</EM>,  | 
|---|
| 130 |          and which will accept UTF-8, UTF-16 or UTF-32 encoded data, as well as an  | 
|---|
| 131 |          ICU UnicodeString as input.</P> | 
|---|
| 132 |       <P><STRONG>Example: </STRONG>match a password, encoded in a UTF-16 UnicodeString:</P> | 
|---|
| 133 |       <PRE>// | 
|---|
| 134 | // Find out if *password* meets our password requirements, | 
|---|
| 135 | // as defined by the regular expression *requirements*. | 
|---|
| 136 | // | 
|---|
| 137 | bool is_valid_password(const UnicodeString& password, const UnicodeString& requirements) | 
|---|
| 138 | { | 
|---|
| 139 |    return boost::u32regex_match(password, boost::make_u32regex(requirements)); | 
|---|
| 140 | } | 
|---|
| 141 | </PRE> | 
|---|
| 142 |       <P> | 
|---|
| 143 |       <P><STRONG>Example: </STRONG>match a UTF-8 encoded filename:</P> | 
|---|
| 144 |       <PRE>// | 
|---|
| 145 | // Extract filename part of a path from a UTF-8 encoded std::string and return the result | 
|---|
| 146 | // as another std::string: | 
|---|
| 147 | // | 
|---|
| 148 | std::string get_filename(const std::string& path) | 
|---|
| 149 | { | 
|---|
| 150 |    boost::u32regex r = boost::make_u32regex("(?:\\A|.*\\\\)([^\\\\]+)"); | 
|---|
| 151 |    boost::smatch what; | 
|---|
| 152 |    if(boost::u32regex_match(path, what, r)) | 
|---|
| 153 |    { | 
|---|
| 154 |       // extract $1 as a std::string: | 
|---|
| 155 |       return what.str(1); | 
|---|
| 156 |    } | 
|---|
| 157 |    else | 
|---|
| 158 |    { | 
|---|
| 159 |       throw std::runtime_error("Invalid pathname"); | 
|---|
| 160 |    } | 
|---|
| 161 | } | 
|---|
| 162 | </PRE> | 
|---|
| 163 |       <H4><A name="u32regex_search"></A>u32regex_search</H4> | 
|---|
| 164 |       <P>For each <A href="regex_search.html">regex_search</A> algorithm defined by  | 
|---|
| 165 |          &lt;boost/regex.hpp&gt;, &lt;boost/regex/icu.hpp&gt; defines an overloaded  | 
|---|
| 166 |          algorithm that takes the same arguments, but which is called <EM>u32regex_search</EM>,  | 
|---|
| 167 |          and which will accept UTF-8, UTF-16 or UTF-32 encoded data, as well as an  | 
|---|
| 168 |          ICU UnicodeString as input.</P> | 
|---|
| 169 |       <P><STRONG>Example: </STRONG>search for a character sequence in a specific  | 
|---|
| 170 |          language block: | 
|---|
| 171 |       </P> | 
|---|
| 172 |       <PRE>UnicodeString extract_greek(const UnicodeString& text) | 
|---|
| 173 | { | 
|---|
| 174 |    // searches through some UTF-16 encoded text for a block encoded in Greek, | 
|---|
| 175 |    // this expression is imperfect, but the best we can do for now - searching | 
|---|
| 176 |    // for specific scripts is actually pretty hard to do right. | 
|---|
| 177 |    // | 
|---|
| 178 |    // Here we search for a character sequence that begins with a Greek letter, | 
|---|
| 179 |    // and continues with characters that are either not-letters ( [^[:L*:]] ) | 
|---|
| 180 |    // or are characters in the Greek character block ( [\\x{370}-\\x{3FF}] ). | 
|---|
| 181 |    // | 
|---|
| 182 |    boost::u32regex r = boost::make_u32regex(L"[\\x{370}-\\x{3FF}](?:[^[:L*:]]|[\\x{370}-\\x{3FF}])*"); | 
|---|
| 183 |    boost::u16match what; | 
|---|
| 184 |    if(boost::u32regex_search(text, what, r)) | 
|---|
| 185 |    { | 
|---|
| 186 |       // extract $0 as a UnicodeString: | 
|---|
| 187 |       return UnicodeString(what[0].first, what.length(0)); | 
|---|
| 188 |    } | 
|---|
| 189 |    else | 
|---|
| 190 |    { | 
|---|
| 191 |       throw std::runtime_error("No Greek found!"); | 
|---|
| 192 |    } | 
|---|
| 193 | }</PRE> | 
|---|
| 194 |       <H4><A name="u32regex_replace"></A>u32regex_replace</H4> | 
|---|
| 195 |       <P>For each <A href="regex_replace.html">regex_replace</A> algorithm defined by  | 
|---|
| 196 |          &lt;boost/regex.hpp&gt;, &lt;boost/regex/icu.hpp&gt; defines an overloaded  | 
|---|
| 197 |          algorithm that takes the same arguments, but which is called <EM>u32regex_replace</EM>,  | 
|---|
| 198 |          and which will accept UTF-8, UTF-16 or UTF-32 encoded data, as well as an  | 
|---|
| 199 |          ICU UnicodeString as input.  The input sequence and the format string  | 
|---|
| 200 |          specifier passed to the algorithm, can be encoded independently (for example  | 
|---|
| 201 |          one can be UTF-8, the other in UTF-16), but the result string / output iterator  | 
|---|
| 202 |          argument must use the same character encoding as the text being searched.</P> | 
|---|
| 203 |       <P><STRONG>Example: </STRONG>Credit card number reformatting:</P> | 
|---|
| 204 |       <PRE>// | 
|---|
| 205 | // Take a credit card number as a string of digits,  | 
|---|
| 206 | // and reformat it as a human readable string with "-" | 
|---|
| 207 | // separating each group of four digits; | 
|---|
| 208 | // note that we're mixing a UTF-32 regex, with a UTF-16 | 
|---|
| 209 | // string and a UTF-8 format specifier, and it still all  | 
|---|
| 210 | // just works: | 
|---|
| 211 | // | 
|---|
| 212 | const boost::u32regex e = boost::make_u32regex("\\A(\\d{3,4})[- ]?(\\d{4})[- ]?(\\d{4})[- ]?(\\d{4})\\z"); | 
|---|
| 213 | const char* human_format = "$1-$2-$3-$4"; | 
|---|
| 214 |  | 
|---|
| 215 | UnicodeString human_readable_card_number(const UnicodeString& s) | 
|---|
| 216 | { | 
|---|
| 217 |    return boost::u32regex_replace(s, e, human_format); | 
|---|
| 218 | }</PRE> | 
|---|
| 219 |       <P> | 
|---|
| 220 |          <H2><A name="iterators"></A>Iterators</H2> | 
|---|
| 221 |          <H3><A name="u32regex_iterator"></A>u32regex_iterator</H3> | 
|---|
| 222 |       <P>Type u32regex_iterator is in all respects the same as <A href="regex_iterator.html"> | 
|---|
| 223 |             regex_iterator</A> except that since the regular expression type is always  | 
|---|
| 224 |          u32regex it only takes one template parameter (the iterator type). It also  | 
|---|
| 225 |          calls u32regex_search internally, allowing it to interface correctly with  | 
|---|
| 226 |          UTF-8, UTF-16, and UTF-32 data:</P> | 
|---|
| 227 |       <PRE> | 
|---|
| 228 | template <class BidirectionalIterator> | 
|---|
| 229 | class u32regex_iterator | 
|---|
| 230 | { | 
|---|
| 231 |    // for members see <A href="regex_iterator.html">regex_iterator</A> | 
|---|
| 232 | }; | 
|---|
| 233 |  | 
|---|
| 234 | typedef u32regex_iterator<const char*>     utf8regex_iterator; | 
|---|
| 235 | typedef u32regex_iterator<const UChar*>    utf16regex_iterator; | 
|---|
| 236 | typedef u32regex_iterator<const UChar32*>  utf32regex_iterator; | 
|---|
| 237 | </PRE> | 
|---|
| 238 |       <P>In order to simplify the construction of a u32regex_iterator from a string,  | 
|---|
| 239 |          there are a series of non-member helper functions called  | 
|---|
| 240 |          make_u32regex_iterator:</P> | 
|---|
| 241 |       <PRE> | 
|---|
| 242 | u32regex_iterator<const char*>  | 
|---|
| 243 |    make_u32regex_iterator(const char* s,  | 
|---|
| 244 |                           const u32regex& e,  | 
|---|
| 245 |                           regex_constants::match_flag_type m = regex_constants::match_default); | 
|---|
| 246 |                            | 
|---|
| 247 | u32regex_iterator<const wchar_t*>  | 
|---|
| 248 |    make_u32regex_iterator(const wchar_t* s,  | 
|---|
| 249 |                           const u32regex& e,  | 
|---|
| 250 |                           regex_constants::match_flag_type m = regex_constants::match_default); | 
|---|
| 251 |                            | 
|---|
| 252 | u32regex_iterator<const UChar*>  | 
|---|
| 253 |    make_u32regex_iterator(const UChar* s,  | 
|---|
| 254 |                           const u32regex& e,  | 
|---|
| 255 |                           regex_constants::match_flag_type m = regex_constants::match_default); | 
|---|
| 256 |                            | 
|---|
| 257 | template <class charT, class Traits, class Alloc> | 
|---|
| 258 | u32regex_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator>  | 
|---|
| 259 |    make_u32regex_iterator(const std::basic_string<charT, Traits, Alloc>& s,  | 
|---|
| 260 |                           const u32regex& e,  | 
|---|
| 261 |                           regex_constants::match_flag_type m = regex_constants::match_default); | 
|---|
| 262 |                            | 
|---|
| 263 | u32regex_iterator<const UChar*>  | 
|---|
| 264 |    make_u32regex_iterator(const UnicodeString& s,  | 
|---|
| 265 |                           const u32regex& e,  | 
|---|
| 266 |                           regex_constants::match_flag_type m = regex_constants::match_default);</PRE> | 
|---|
| 267 |       <P> | 
|---|
| 268 |       <P>Each of these overloads returns an iterator that enumerates all occurrences of  | 
|---|
| 269 |          expression <EM>e</EM>, in text <EM>s</EM>, using match_flags <EM>m.</EM></P> | 
|---|
| 270 |       <P><STRONG>Example</STRONG>: search for international currency symbols, along with  | 
|---|
| 271 |          their associated numeric value:</P> | 
|---|
| 272 |       <PRE> | 
|---|
| 273 | void enumerate_currencies(const std::string& text) | 
|---|
| 274 | { | 
|---|
| 275 |    // enumerate and print all the currency symbols, along | 
|---|
| 276 |    // with any associated numeric values: | 
|---|
| 277 |    const char* re =  | 
|---|
| 278 |       "([[:Sc:]][[:Cf:][:Cc:][:Z*:]]*)?" | 
|---|
| 279 |       "([[:Nd:]]+(?:[[:Po:]][[:Nd:]]+)?)?" | 
|---|
| 280 |       "(?(1)" | 
|---|
| 281 |          "|(?(2)" | 
|---|
| 282 |             "[[:Cf:][:Cc:][:Z*:]]*" | 
|---|
| 283 |          ")" | 
|---|
| 284 |          "[[:Sc:]]" | 
|---|
| 285 |       ")"; | 
|---|
| 286 |    boost::u32regex r = boost::make_u32regex(re); | 
|---|
| 287 |    boost::u32regex_iterator<std::string::const_iterator> i(boost::make_u32regex_iterator(text, r)), j; | 
|---|
| 288 |    while(i != j) | 
|---|
| 289 |    { | 
|---|
| 290 |       std::cout << (*i)[0] << std::endl; | 
|---|
| 291 |       ++i; | 
|---|
| 292 |    } | 
|---|
| 293 | }</PRE> | 
|---|
| 294 |       <P> | 
|---|
| 295 |       <P>Calling | 
|---|
| 296 |       </P> | 
|---|
| 297 |       <PRE>enumerate_currencies(" $100.23 or £198.12 ");</PRE> | 
|---|
| 298 |       <P>Yields the output:</P> | 
|---|
| 299 |       <PRE>$100.23<BR>£198.12</PRE> | 
|---|
| 300 |       <P>Provided of course that the input is encoded as UTF-8.</P> | 
|---|
| 301 |       <H3><A name="u32regex_token_iterator"></A>u32regex_token_iterator</H3> | 
|---|
| 302 |       <P>Type u32regex_token_iterator is in all respects the same as <A href="regex_token_iterator.html"> | 
|---|
| 303 |             regex_token_iterator</A> except that since the regular expression type is  | 
|---|
| 304 |          always u32regex it only takes one template parameter (the iterator type).   | 
|---|
| 305 |          It also calls u32regex_search internally, allowing it to interface correctly  | 
|---|
| 306 |          with UTF-8, UTF-16, and UTF-32 data:</P> | 
|---|
| 307 |       <PRE>template <class BidirectionalIterator> | 
|---|
| 308 | class u32regex_token_iterator | 
|---|
| 309 | { | 
|---|
| 310 |    // for members see <A href="regex_token_iterator.html">regex_token_iterator</A> | 
|---|
| 311 | }; | 
|---|
| 312 |  | 
|---|
| 313 | typedef u32regex_token_iterator<const char*>     utf8regex_token_iterator; | 
|---|
| 314 | typedef u32regex_token_iterator<const UChar*>    utf16regex_token_iterator; | 
|---|
| 315 | typedef u32regex_token_iterator<const UChar32*>  utf32regex_token_iterator; | 
|---|
| 316 | </PRE> | 
|---|
| 317 |       <P>In order to simplify the construction of a u32regex_token_iterator from a  | 
|---|
| 318 |          string, there are a series of non-member helper functions called  | 
|---|
| 319 |          make_u32regex_token_iterator:</P> | 
|---|
| 320 |       <PRE> | 
|---|
| 321 | u32regex_token_iterator<const char*>  | 
|---|
| 322 |    make_u32regex_token_iterator(const char* s,  | 
|---|
| 323 |                                 const u32regex& e,  | 
|---|
| 324 |                                 int sub,  | 
|---|
| 325 |                                 regex_constants::match_flag_type m = regex_constants::match_default); | 
|---|
| 326 |                                 | 
|---|
| 327 | u32regex_token_iterator<const wchar_t*>  | 
|---|
| 328 |    make_u32regex_token_iterator(const wchar_t* s,  | 
|---|
| 329 |                                 const u32regex& e,  | 
|---|
| 330 |                                 int sub,  | 
|---|
| 331 |                                 regex_constants::match_flag_type m = regex_constants::match_default); | 
|---|
| 332 |                                  | 
|---|
| 333 | u32regex_token_iterator<const UChar*>  | 
|---|
| 334 |    make_u32regex_token_iterator(const UChar* s,  | 
|---|
| 335 |                                 const u32regex& e,  | 
|---|
| 336 |                                 int sub,  | 
|---|
| 337 |                                 regex_constants::match_flag_type m = regex_constants::match_default); | 
|---|
| 338 |                                  | 
|---|
| 339 | template <class charT, class Traits, class Alloc> | 
|---|
| 340 | u32regex_token_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator>  | 
|---|
| 341 |    make_u32regex_token_iterator(const std::basic_string<charT, Traits, Alloc>& s,  | 
|---|
| 342 |                                 const u32regex& e,  | 
|---|
| 343 |                                 int sub,  | 
|---|
| 344 |                                 regex_constants::match_flag_type m = regex_constants::match_default); | 
|---|
| 345 |                                  | 
|---|
| 346 | u32regex_token_iterator<const UChar*>  | 
|---|
| 347 |    make_u32regex_token_iterator(const UnicodeString& s,  | 
|---|
| 348 |                                 const u32regex& e,  | 
|---|
| 349 |                                 int sub,  | 
|---|
| 350 |                                 regex_constants::match_flag_type m = regex_constants::match_default);</PRE> | 
|---|
| 351 |       <P> | 
|---|
| 352 |       <P>Each of these overloads returns an iterator that enumerates all occurrences of  | 
|---|
| 353 |          marked sub-expression <EM>sub</EM> in regular expression <EM>e</EM>, found  | 
|---|
| 354 |          in text <EM>s</EM>, using match_flags <EM>m.</EM></P> | 
|---|
| 355 |       <PRE> | 
|---|
| 356 | template <std::size_t N> | 
|---|
| 357 | u32regex_token_iterator<const char*>  | 
|---|
| 358 |    make_u32regex_token_iterator(const char* p,  | 
|---|
| 359 |                                 const u32regex& e,  | 
|---|
| 360 |                                 const int (&submatch)[N],  | 
|---|
| 361 |                                 regex_constants::match_flag_type m = regex_constants::match_default); | 
|---|
| 362 |                                  | 
|---|
| 363 | template <std::size_t N> | 
|---|
| 364 | u32regex_token_iterator<const wchar_t*>  | 
|---|
| 365 |    make_u32regex_token_iterator(const wchar_t* p,  | 
|---|
| 366 |                                 const u32regex& e,  | 
|---|
| 367 |                                 const int (&submatch)[N],  | 
|---|
| 368 |                                 regex_constants::match_flag_type m = regex_constants::match_default); | 
|---|
| 369 |                                  | 
|---|
| 370 | template <std::size_t N> | 
|---|
| 371 | u32regex_token_iterator<const UChar*>  | 
|---|
| 372 |    make_u32regex_token_iterator(const UChar* p,  | 
|---|
| 373 |                                 const u32regex& e,  | 
|---|
| 374 |                                 const int (&submatch)[N],  | 
|---|
| 375 |                                 regex_constants::match_flag_type m = regex_constants::match_default); | 
|---|
| 376 |                                  | 
|---|
| 377 | template <class charT, class Traits, class Alloc, std::size_t N> | 
|---|
| 378 | u32regex_token_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator>  | 
|---|
| 379 |    make_u32regex_token_iterator(const std::basic_string<charT, Traits, Alloc>& p,  | 
|---|
| 380 |                                 const u32regex& e,  | 
|---|
| 381 |                                 const int (&submatch)[N],  | 
|---|
| 382 |                                 regex_constants::match_flag_type m = regex_constants::match_default); | 
|---|
| 383 |                                  | 
|---|
| 384 | template <std::size_t N> | 
|---|
| 385 | u32regex_token_iterator<const UChar*>  | 
|---|
| 386 |    make_u32regex_token_iterator(const UnicodeString& s,  | 
|---|
| 387 |                                 const u32regex& e,  | 
|---|
| 388 |                                 const int (&submatch)[N],  | 
|---|
| 389 |                                 regex_constants::match_flag_type m = regex_constants::match_default); | 
|---|
| 390 | </PRE> | 
|---|
| 391 |       <P>Each of these overloads returns an iterator that enumerates one sub-expression  | 
|---|
| 392 |          for each <EM>submatch</EM> in regular expression <EM>e</EM>, found in  | 
|---|
| 393 |          text <EM>s</EM>, using match_flags <EM>m.</EM></P> | 
|---|
| 394 |       <PRE> | 
|---|
| 395 | u32regex_token_iterator<const char*>  | 
|---|
| 396 |    make_u32regex_token_iterator(const char* p,  | 
|---|
| 397 |                                 const u32regex& e,  | 
|---|
| 398 |                                 const std::vector<int>& submatch,  | 
|---|
| 399 |                                 regex_constants::match_flag_type m = regex_constants::match_default); | 
|---|
| 400 |                                  | 
|---|
| 401 | u32regex_token_iterator<const wchar_t*>  | 
|---|
| 402 |    make_u32regex_token_iterator(const wchar_t* p,  | 
|---|
| 403 |                                 const u32regex& e,  | 
|---|
| 404 |                                 const std::vector<int>& submatch,  | 
|---|
| 405 |                                 regex_constants::match_flag_type m = regex_constants::match_default); | 
|---|
| 406 |                                  | 
|---|
| 407 | u32regex_token_iterator<const UChar*>  | 
|---|
| 408 |    make_u32regex_token_iterator(const UChar* p,  | 
|---|
| 409 |                                 const u32regex& e,  | 
|---|
| 410 |                                 const std::vector<int>& submatch,  | 
|---|
| 411 |                                 regex_constants::match_flag_type m = regex_constants::match_default); | 
|---|
| 412 |                                  | 
|---|
| 413 | template <class charT, class Traits, class Alloc> | 
|---|
| 414 | u32regex_token_iterator<typename std::basic_string<charT, Traits, Alloc>::const_iterator>  | 
|---|
| 415 |    make_u32regex_token_iterator(const std::basic_string<charT, Traits, Alloc>& p,  | 
|---|
| 416 |                                 const u32regex& e,  | 
|---|
| 417 |                                 const std::vector<int>& submatch,  | 
|---|
| 418 |                                 regex_constants::match_flag_type m = regex_constants::match_default); | 
|---|
| 419 |                                  | 
|---|
| 420 | u32regex_token_iterator<const UChar*>  | 
|---|
| 421 |    make_u32regex_token_iterator(const UnicodeString& s,  | 
|---|
| 422 |                                 const u32regex& e,  | 
|---|
| 423 |                                 const std::vector<int>& submatch,  | 
|---|
| 424 |                                 regex_constants::match_flag_type m = regex_constants::match_default); | 
|---|
| 425 | </PRE> | 
|---|
| 426 |       <P>Each of these overloads returns an iterator that enumerates one sub-expression  | 
|---|
| 427 |          for each <EM>submatch</EM> in regular expression <EM>e</EM>, found in  | 
|---|
| 428 |          text <EM>s</EM>, using match_flags <EM>m.</EM></P> | 
|---|
| 429 |       <P><STRONG>Example</STRONG>: search for international currency symbols, along with  | 
|---|
| 430 |          their associated numeric value:</P> | 
|---|
| 431 |       <PRE> | 
|---|
| 432 | void enumerate_currencies2(const std::string& text) | 
|---|
| 433 | { | 
|---|
| 434 |    // enumerate and print all the currency symbols, along | 
|---|
| 435 |    // with any associated numeric values: | 
|---|
| 436 |    const char* re =  | 
|---|
| 437 |       "([[:Sc:]][[:Cf:][:Cc:][:Z*:]]*)?" | 
|---|
| 438 |       "([[:Nd:]]+(?:[[:Po:]][[:Nd:]]+)?)?" | 
|---|
| 439 |       "(?(1)" | 
|---|
| 440 |          "|(?(2)" | 
|---|
| 441 |             "[[:Cf:][:Cc:][:Z*:]]*" | 
|---|
| 442 |          ")" | 
|---|
| 443 |          "[[:Sc:]]" | 
|---|
| 444 |       ")"; | 
|---|
| 445 |    boost::u32regex r = boost::make_u32regex(re); | 
|---|
| 446 |    boost::u32regex_token_iterator<std::string::const_iterator>  | 
|---|
| 447 |       i(boost::make_u32regex_token_iterator(text, r, 1)), j; | 
|---|
| 448 |    while(i != j) | 
|---|
| 449 |    { | 
|---|
| 450 |       std::cout << *i << std::endl; | 
|---|
| 451 |       ++i; | 
|---|
| 452 |    } | 
|---|
| 453 | } | 
|---|
| 454 | </PRE> | 
|---|
| 455 |       <P> | 
|---|
| 456 |          <HR> | 
|---|
| 457 |       <p>Revised   | 
|---|
| 458 |          <!--webbot bot="Timestamp" S-Type="EDITED" S-Format="%d %B, %Y" startspan --> | 
|---|
| 459 |          05 Jan 2005   | 
|---|
| 460 |          <!--webbot bot="Timestamp" endspan i-checksum="39359" --></p> | 
|---|
| 461 |       <p><i>© Copyright John Maddock 2005</i></p> | 
|---|
| 462 |       <P><I>Use, modification and distribution are subject to the Boost Software License,  | 
|---|
| 463 |             Version 1.0. (See accompanying file <A href="../../../LICENSE_1_0.txt">LICENSE_1_0.txt</A> | 
|---|
| 464 |             or copy at <A href="http://www.boost.org/LICENSE_1_0.txt">http://www.boost.org/LICENSE_1_0.txt</A>)</I></P> | 
|---|
| 465 |    </body> | 
|---|
| 466 | </html> | 
|---|
| 467 |  | 
|---|
| 468 |  | 
|---|