Google

Main Page   Namespace List   Compound List   File List   Compound Members   File Members  

pcre++.cc

Go to the documentation of this file.
00001 /*
00002  *
00003  *   $Id: pcre++.cc,v 1.2 2002/01/02 01:25:30 zarahg Exp $
00004  *
00005  *  This file  is part of the PCRE++ Class Library.
00006  *
00007  *  By  accessing  this software,  PCRE++, you  are  duly informed
00008  *  of and agree to be  bound  by the  conditions  described below
00009  *  in this notice:
00010  *
00011  *  This software product,  PCRE++,  is developed by Thomas Linden
00012  *  and  copyrighted (C) 2002  by  Thomas Linden,  with all rights 
00013  *  reserved.
00014  *
00015  *  There  is no charge for PCRE++ software.  You can redistribute
00016  *  it and/or modify it under the terms of the GNU  Lesser General
00017  *  Public License, which is incorporated by reference herein.
00018  *
00019  *  PCRE++ is distributed WITHOUT ANY WARRANTY, IMPLIED OR EXPRESS,
00020  *  OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE or that
00021  *  the use of it will not infringe on any third party's intellec-
00022  *  tual property rights.
00023  *
00024  *  You should have received a copy of the GNU Lesser General Public
00025  *  License along with PCRE++.  Copies can also be obtained from:
00026  *
00027  *    http://www.gnu.org/licenses/lgpl.txt
00028  *
00029  *  or by writing to:
00030  *
00031  *  Free Software Foundation, Inc.
00032  *  59 Temple Place, Suite 330
00033  *  Boston, MA 02111-1307
00034  *  USA
00035  *
00036  *  Or contact:
00037  *
00038  *   "Thomas Linden" <tom@daemon.de>
00039  *
00040  *
00041  */
00042 
00043 
00044 #include "pcre++.h"
00045 
00046 
00047 /*
00048  * CONSTRUCTORS
00049  */
00050 Pcre::Pcre(const string& expression) {
00051   _expression   = expression;
00052   _flags        = 0;
00053   case_t = global_t = false;
00054   zero();
00055   Compile(0);
00056 }
00057 
00058 Pcre::Pcre(const string& expression, const string& flags) {
00059   _expression   = expression;
00060   unsigned int FLAG = 0;
00061 
00062   for(unsigned int flag=0; flag<flags.length(); flag++) {
00063     switch(flags[flag]) {
00064     case 'i': FLAG |= PCRE_CASELESS;  case_t = true;   break;
00065     case 'm': FLAG |= PCRE_MULTILINE;                  break;
00066     case 's': FLAG |= PCRE_DOTALL;                     break;
00067     case 'x': FLAG |= PCRE_EXTENDED;                   break;
00068     case 'g':                         global_t = true; break;
00069     }
00070   }
00071 
00072   _flags = FLAG;
00073 
00074   zero();
00075   Compile(FLAG);
00076 }
00077 
00078 Pcre::Pcre(const Pcre &P) {
00079   _expression = P._expression;
00080   _flags      = P._flags;
00081   case_t      = P.case_t;
00082   global_t    = P.global_t;
00083   zero();
00084   Compile(_flags);
00085 }
00086 
00087 Pcre::Pcre() {
00088   zero();
00089 }
00090 
00091 
00092 
00093 
00094 
00095 
00096 
00097 /*
00098  * Destructor
00099  */
00100 Pcre::~Pcre() {
00101   /* avoid deleting of uninitialized pointers */
00102   if (p_pcre != NULL) {
00103     pcre_free(p_pcre);
00104   }
00105   if (p_pcre_extra != NULL) {
00106     pcre_free(p_pcre_extra);
00107   }
00108   if(sub_vec != NULL) {
00109     delete[] sub_vec;
00110   }
00111   if(num_matches > 0) {
00112     delete resultset;
00113   }
00114   if(err_str != NULL) {
00115     delete err_str;
00116   }
00117 }
00118 
00119 
00120 
00121 
00122 /*
00123  * operator= definitions
00124  */
00125 const Pcre& Pcre::operator = (const string& expression) {
00126   /* reset the object and re-intialize it */
00127   reset();
00128   _expression = expression;
00129   _flags      = 0;
00130   case_t = global_t = false;
00131   Compile(0);
00132   return *this;
00133 }
00134 
00135 
00136 const Pcre& Pcre::operator = (const Pcre &P) {
00137   reset();
00138   _expression = P._expression;
00139   _flags      = P._flags;
00140   case_t      = P.case_t;
00141   global_t    = P.global_t;
00142   zero();
00143   Compile(_flags);
00144   return *this;
00145 }
00146 
00147 
00148 
00149 
00150 
00151 
00152 /*
00153  * mem resetting methods
00154  */
00155 void Pcre::zero() {
00156   /* what happens if p_pcre is already allocated? hm ... */
00157   p_pcre_extra = NULL;
00158   p_pcre       = NULL;
00159   sub_vec      = NULL;
00160   resultset    = NULL;
00161   err_str      = NULL;
00162   num_matches  = -1;
00163 }
00164 
00165 void Pcre::reset() {
00166   did_match   = false;
00167   num_matches = -1;
00168 }
00169 
00170 
00171 
00172 
00173 
00174 /*
00175  * compile the expression
00176  */
00177 void Pcre::Compile(int flags) {
00178   p_pcre       = pcre_compile((char *)_expression.c_str(), flags,
00179                               (const char **)(&err_str), &erroffset, NULL);
00180 
00181   if(p_pcre == NULL) {
00182     /* umh, that's odd, the parser should not fail at all */
00183     string Error = err_str;
00184     throw exception("pcre_compile(..) failed: " + Error);
00185   }
00186 
00187   /* calculate the number of substrings we are willing to catch */
00188   int where;
00189   int info = pcre_fullinfo( p_pcre, p_pcre_extra, PCRE_INFO_CAPTURECOUNT, &where);
00190   if(info == 0) {
00191     sub_len = (where +2) * 3; /* see "man pcre" for the exact formula */
00192   }
00193   else {
00194     throw exception(info);
00195   }
00196   reset();
00197 }
00198 
00199 
00200 
00201 
00202 /*
00203  * API methods
00204  */
00205 bool Pcre::search(const string& stuff, int OffSet) {
00206   return dosearch(stuff, OffSet);
00207 }
00208 
00209 bool Pcre::search(const string& stuff) {
00210   return dosearch(stuff, 0);
00211 }
00212 
00213 bool Pcre::dosearch(const string& stuff, int OffSet) {
00214   reset();
00215   sub_vec = new int[sub_len];
00216   int num = pcre_exec(p_pcre, p_pcre_extra, (char *)stuff.c_str(),
00217                         (int)stuff.length(), OffSet, 0, (int *)sub_vec, sub_len);
00218 
00219   if(num < 0) {
00220     /* no match at all */
00221     return false;
00222   }
00223   else if(num == 0) {
00224     /* vector too small, there were too many substrings in stuff */
00225     return false;
00226   }
00227   else if(num == 1) {
00228     /* we had a match, but without substrings */
00229     did_match = true;
00230     num_matches = 0;
00231     return true;
00232   }
00233   else if(num > 1) {
00234     /* we had matching substrings */
00235     resultset = new Array;
00236     const char **stringlist;
00237     did_match = true;
00238     num_matches = num - 1;
00239 
00240     int res = pcre_get_substring_list((char *)stuff.c_str(), sub_vec, num, &stringlist);
00241     if(res == 0) {
00242       for(int i=1; i<num; i++) {
00243         resultset->push_back(stringlist[i]);
00244       }
00245       pcre_free_substring_list(stringlist);
00246     }
00247     else {
00248       throw exception(res);
00249     }
00250     return true;
00251   }
00252   else {
00253     /* some other uncommon error occured */
00254     return false;
00255   }
00256 }
00257 
00258 Array* Pcre::get_sub_strings() {
00259   if(resultset != NULL)
00260     return resultset;
00261   else
00262     return NULL;
00263 }
00264 
00265 string Pcre::get_match(int pos) {
00266   if(pos >= 0 && pos < num_matches) {
00267     ArrayIterator P = resultset->begin() + pos;
00268     return *P;
00269   }
00270   else {
00271     throw exception("out of range");
00272   }
00273 }
00274 
00275 int Pcre::get_match_start() {
00276   if (sub_vec)
00277     return sub_vec[0];
00278   else
00279     return -1;
00280 }
00281 
00282 int Pcre::get_match_end() {
00283   if (sub_vec)
00284     return sub_vec[1] - 1;
00285   else
00286     return -1;
00287 }
00288 
00289 int Pcre::get_match_start(int pos) {
00290   if(pos >= 0 && pos <= num_matches) {
00291     /*
00292      * sub_vec[0] and [1] is the start/end of the entire string.
00293      */
00294     return sub_vec[ (++pos) * 2 ];
00295   }
00296   else {
00297     throw exception("out of range");
00298   }  
00299 }
00300 
00301 int Pcre::get_match_end(int pos) {
00302   if(pos >= 0 && pos <= num_matches) {
00303     /*
00304      * the end offset of a subpattern points to
00305      * the first offset of the next substring,
00306      * therefore -1
00307      */
00308     return sub_vec[ ((++pos) * 2) + 1 ] - 1;
00309   }
00310   else {
00311     throw exception("out of range");
00312   }
00313 }
00314 
00315 size_t Pcre::get_match_length(int pos) {
00316   if(pos >= 0 && pos < num_matches) {
00317     ArrayIterator P = resultset->begin() + pos;
00318     return P->length();
00319   }
00320   else {
00321     throw exception("out of range");
00322   }
00323 }
00324 
00325 Array Pcre::_split(const string& piece, int limit, int start_offset, int end_offset) {
00326   Array Splitted;
00327   /* _expression will be used as delimiter */
00328   if(_expression.length() == 1) {
00329     /* use the plain c++ way, ignore the pre-compiled p_pcre */
00330     string buffer, _delimiter, _piece;
00331     char z;
00332     if(case_t) {
00333       z = toupper(_expression[0]);
00334       for(size_t pos=0; pos < piece.length(); pos++) {
00335         _piece += (char)toupper(piece[pos]);
00336       }
00337     }
00338     else {
00339       z = _expression[0];
00340       _piece = piece;
00341     }
00342     for(size_t pos=0; pos<piece.length(); pos++) {
00343       if(_piece[pos] == z) {
00344         Splitted.push_back(buffer);
00345         buffer = "";
00346       }
00347       else {
00348         buffer += piece[pos];
00349       }
00350     }
00351     if(buffer != "") {
00352       Splitted.push_back(buffer);
00353     }
00354   }
00355   else {
00356     /* use the regex way */
00357     if(_expression[0] != '(' && _expression[ _expression.length() ] != ')' ) {
00358       /* oh, oh - the pre-compiled expression does not contain brackets */
00359       pcre_free(p_pcre);
00360       pcre_free(p_pcre_extra);
00361       
00362       pcre       *_p = NULL;
00363       pcre_extra *_e = NULL;;
00364 
00365       p_pcre = _p;
00366       p_pcre_extra = _e;
00367 
00368       _expression = "(" + _expression + ")";
00369       Compile(_flags);
00370     }
00371     int num_pieces=0, pos=0, piece_end = 0, piece_start = 0;
00372     for(;;) {
00373       if(search(piece, pos) == true) {
00374         if(matches() > 0) {
00375           piece_end   = get_match_start(0) - 1;
00376           piece_start = pos;
00377           pos = piece_end + 1 + get_match_length(0);
00378           string junk(piece, piece_start, (piece_end - piece_start)+1);
00379           num_pieces++;
00380           if( (limit != 0 && num_pieces < limit) || limit == 0) {
00381             if( (start_offset != 0 && num_pieces >= start_offset) || start_offset == 0) {
00382               if( (end_offset != 0 && num_pieces <= end_offset) || end_offset == 0) {
00383                 /* we are within the allowed range, so just add the grab */
00384                 Splitted.push_back(junk);
00385               }
00386             }
00387           }
00388         }
00389       }
00390       else {
00391         /* the rest of the string, there are no more delimiters */
00392         string junk(piece, pos, (piece.length() - pos));
00393         num_pieces++;
00394         if( (limit != 0 && num_pieces < limit) || limit == 0) {
00395           if( (start_offset != 0 && num_pieces >= start_offset) || start_offset == 0) {
00396             if( (end_offset != 0 && num_pieces <= end_offset) || end_offset == 0) {
00397               /* we are within the allowed range, so just add the grab */
00398               Splitted.push_back(junk);
00399             }
00400           }
00401         }
00402         break;
00403       }
00404     } // for()
00405   } // if(_expression.length()
00406   return Splitted;
00407 }
00408 
00409 Array Pcre::split(const string& piece) {
00410   return _split(piece, 0, 0, 0);
00411 }
00412 
00413 Array Pcre::split(const string& piece, int limit) {
00414   return _split(piece, limit, 0, 0);
00415 }
00416 
00417 Array Pcre::split(const string& piece, int limit, int start_offset) {
00418   return _split(piece, limit, start_offset, 0);
00419 }
00420 
00421 Array Pcre::split(const string& piece, int limit, int start_offset, int end_offset) {
00422   return _split(piece, limit, start_offset, end_offset);
00423 }
00424 
00425 Array Pcre::split(const string& piece, vector<int> positions) {
00426   Array PreSplitted = _split(piece, 0, 0, 0);
00427   Array Splitted;
00428   for(vector<int>::iterator vecIt=positions.begin(); vecIt != positions.end(); ++vecIt) {
00429     Splitted.push_back(PreSplitted[*vecIt]);
00430   }
00431   return Splitted;
00432 }
00433 
00434 
00435 
00436 string Pcre::replace(const string& piece, const string& with) {
00437   string Replaced(piece);
00438 
00439   /*
00440    * very first job: look, if the expression already contains
00441    * braces, if yes, do not add braces, else, do it
00442    */
00443   Pcre braces("[^\\\\]\\(.*[^\\\\]\\)"); // perlish: [^\\]\(.*[^\\]\)
00444   if(! braces.search(_expression)) {
00445     //  if(_expression[0] != '(' && _expression[ _expression.length() ] != ')' ) {
00446     /* oh, oh - the pre-compiled expression does not contain brackets */
00447 
00448     /* recreate the p_pcre* objects to avoid memory leaks */
00449     pcre_free(p_pcre);
00450     pcre_free(p_pcre_extra);
00451       
00452     pcre       *_p = NULL;
00453     pcre_extra *_e = NULL;;
00454 
00455     p_pcre = _p;
00456     p_pcre_extra = _e;
00457 
00458     _expression = "(" + _expression + ")";
00459     Compile(_flags);
00460   }
00461 
00462   if(search(piece)) {
00463     /* we found at least one match */
00464     string use_with = _replace_vars(with);
00465     if(!global_t) {
00466       /*
00467        * only once, use the entire match
00468        * Patch submitted by Mark Carrington <mark@mutantpenguin.co.uk>
00469        */
00470       if(matched() && matches() >= 1) {
00471         int len = get_match_end() - get_match_start() + 1;
00472         Replaced.replace(get_match_start(0), len, use_with);
00473       }
00474     }
00475     else {
00476       /*
00477        * global replace.
00478        *
00479        * We need to keep checking the line after it is modified to see the next match.
00480        * Especially \s is something of a bitch as it can be a newline, return carriage,
00481        * space, tab, etc ... so we have to keep  searching for the next type.
00482        * Patch submitted by Jim Hull <imaginos@imaginos.net>
00483        */
00484       string sLeftOver = Replaced;
00485       int iCurPosition = 0;
00486       while( search( sLeftOver ) ) {
00487         if( matched() && matches() >= 1 ) {
00488           int len = 0;
00489           string lookfor;
00490           lookfor.erase();
00491           int match_pos;
00492           for (match_pos = 0; match_pos < matches(); match_pos++) {
00493             len += ((get_match_end(match_pos) - get_match_start(match_pos)) + 1);
00494             lookfor += get_match(match_pos);
00495           }
00496           match_pos = Replaced.find( lookfor, iCurPosition );
00497           Replaced.replace(match_pos, len, use_with);
00498           iCurPosition = ( match_pos + use_with.length() );
00499           sLeftOver = Replaced.substr( iCurPosition, string::npos );
00500         }
00501       }
00502     }
00503   }
00504   return Replaced;
00505 }
00506 
00507 
00508 
00509 string Pcre::_replace_vars(const string& piece) {
00510   Pcre dollar("\\$[0-9]+");
00511   string with = piece;
00512   if(dollar.search(with)) {
00513     for(int index=0; index < num_matches; index++) {
00514       /* do it for each existing sub string */
00515       string sub   = get_match(index); // what "$1" resulted
00516       ostringstream num;
00517       num << index+1;
00518       string dollar_num = "(\\$" + num.str() + ")";
00519       Pcre subsplit(dollar_num); // "\\$1"
00520       // normally 2 (or more) parts, the one in front of and the other one after "$1"
00521       Array splitted = subsplit.split(with); 
00522       string Replaced;
00523       for(size_t pos=0; pos < splitted.size(); pos++) {
00524         if(pos == (splitted.size() - 1))
00525           Replaced += splitted[pos];
00526         else
00527           Replaced += splitted[pos] + sub;
00528       }
00529       with = Replaced; // well, one part is done
00530     }
00531     return with;
00532   }
00533   else {
00534     /* hm, no $[0-9]+ stuff, so just return it untouched */
00535     return with;
00536   }
00537 }

Generated on Tue Jul 16 22:14:38 2002 for PCRE++ by doxygen1.2.13.1 written by Dimitri van Heesch, © 1997-2001