string_manipulation.cpp

Go to the documentation of this file.
00001 #ifndef STRING_MANIPULATION_IMPLEMENTATION_FILE
00002 #define STRING_MANIPULATION_IMPLEMENTATION_FILE
00003 
00004 /*****************************************************************************\
00005 *                                                                             *
00006 *  Name   : string_manipulation                                               *
00007 *  Author : Chris Koeritz                                                     *
00008 *  Author : Brit Minor                                                        *
00009 *  Author : Aaron Buchanan                                                    *
00010 *                                                                             *
00011 *******************************************************************************
00012 * Copyright (c) 2000-$now By Author.  This program is free software; you can  *
00013 * redistribute it and/or modify it under the terms of the GNU General Public  *
00014 * License as published by the Free Software Foundation; either version 2 of   *
00015 * the License or (at your option) any later version.  This is online at:      *
00016 *     http://www.fsf.org/copyleft/gpl.html                                    *
00017 * Please send any updates to: fred@gruntose.com                               *
00018 \*****************************************************************************/
00019 
00020 #include "parser_bits.h"
00021 #include "string_convert.h"  // pulled in to ensure it's used in textual.
00022 #include "string_manipulation.h"
00023 
00024 #include <basis/byte_array.h>
00025 #include <basis/chaos.h>
00026 #include <basis/function.h>
00027 #include <basis/istring.h>
00028 #include <basis/log_base.h>
00029 #include <basis/mutex.h>
00030 #include <data_struct/static_memory_gremlin.h>
00031 
00032 SAFE_STATIC_CONST(istring_object, string_manipulation::splitter_finding_set,
00033     ("\t\r\n -,;?!.:"))
00034   // any of these characters make a valid place to break a line.
00035 
00036 // by CAK.
00037 istring string_manipulation::make_random_name(int min, int max)
00038 {
00039   chaos rando;
00040   int length = rando.inclusive(min, max);
00041     // pick a size for the string.
00042   istring to_return;
00043   for (int i = 0; i < length; i++) {
00044     int chah = rando.inclusive(0, 26);
00045       // use a range one larger than alphabet size.
00046     char to_add = 'a' + chah;
00047     if (chah == 26) to_add = '_';
00048       // patch the extra value to be a separator.
00049     to_return += to_add;
00050   }
00051   return to_return;
00052 }
00053 
00054 // by brit.
00055 bool string_manipulation::quote_string(const istring &to_quote,
00056     istring &quoted_string)
00057 {
00058   bool bRet = true;
00059 
00060   const char *dquote = "\"";
00061   const char *squote = "\'";
00062 
00063   bool bHasDQuote = to_quote.contains(dquote);
00064   bool bHasSQuote = to_quote.contains(squote);
00065 
00066   if (bHasDQuote && bHasSQuote) {
00067     // Can't quote this string unambiguously.
00068     quoted_string = to_quote;
00069     bRet = false;
00070   } else if (bHasDQuote)
00071   {
00072     quoted_string = squote;
00073     quoted_string += to_quote;
00074     quoted_string += squote;
00075   }
00076   else
00077   {
00078     quoted_string = dquote;
00079     quoted_string += to_quote;
00080     quoted_string += dquote;
00081   }
00082 
00083   return bRet;
00084 }
00085 
00086 // by CAK.
00087 istring string_manipulation::long_line(char line_item, int repeat)
00088 { return istring(line_item, repeat); }
00089 
00090 // by CAK.
00091 istring string_manipulation::indentation(int spaces)
00092 {
00093   istring s;
00094   for (int i = 0; i < spaces; i++) s += ' ';
00095   return s;
00096 }
00097 
00098 // by Aaron:
00099 istring &string_manipulation::escape_chars(istring &input_string)
00100 {
00101     // look for backslash or quote characters in the string.
00102     // if any are found then insert an additional backslash to
00103     //  escape the character
00104     const char *backslash = "\\";
00105     for( int i = 0; i < input_string.length(); ++i )
00106     {
00107         const char cur = input_string[i];
00108         if( ('\\' == cur) || ('\"' == cur) )
00109         {
00110             input_string.insert(i, backslash);
00111             ++i;    // skip back to current character
00112         }
00113     }
00114     return input_string;
00115 }
00116 
00117 // by Aaron:
00118 istring &string_manipulation::unescape_chars(istring &input_string)
00119 {
00120     for( int i = 1; i < input_string.length(); ++i )
00121     {
00122         if( ('\\' == input_string[i-1]) && 
00123             (('\\' == input_string[i]) || ('\"' == input_string[i])) )
00124         {
00125             input_string.zap(i-1, i-1);
00126             // i now points to the next char in the buffer...the loop will
00127             //  advance again, but this is correct or else we will have problems
00128             //  if there is something like \\\\.
00129         }
00130     }
00131     return input_string;
00132 }
00133 
00134 // by Aaron:
00135 bool string_manipulation::substring(const istring &look_in,
00136     const istring &after, const istring &before, istring &found)
00137 {
00138     int start = look_in.find(after);
00139     if( start < 0 )
00140         return false;
00141 
00142     start += after.length();
00143     const int end = look_in.find(before, start);
00144     if( end < 0 )
00145         return false;
00146 
00147     found = look_in.substring(start, end - 1);
00148     return true;
00149 }
00150 
00151 void string_manipulation::carriage_returns_to_spaces(istring &to_strip)
00152 {
00153   for (int j = 0; j < to_strip.length(); j++) {
00154     int original_j = j;  // track where we started looking.
00155     if (!parser_bits::is_eol(to_strip[j])) continue;
00156     // we have found at least one CR.  let's see what else there is.
00157     if ( (to_strip[j] == '\r') && (to_strip[j + 1] == '\n') ) {
00158       // this is looking like a DOS CR.  let's skip that now.
00159       j++;
00160     }
00161     j++;  // skip the one we know is a CR.
00162     if (parser_bits::is_eol(to_strip[j])) {
00163       // we are seeing more than one carriage return in a row.  let's
00164       // truncate that down to just one.
00165       j++;
00166       while (parser_bits::is_eol(to_strip[j]) && (j < to_strip.length()))
00167         j++;  // skip to next one that might not be CR.
00168       // now we think we know where there's this huge line of CRs.  we will
00169       // turn them all into spaces except the first.
00170       to_strip[original_j] = '\n';
00171       for (int k = original_j + 1; k < j; k++) to_strip[k] = ' ';
00172       // put the index back so we'll start looking at the non-CR char.
00173       j--;
00174       continue;  // now skip back out to the main loop.
00175     } else {
00176       // we see only one carriage return, which we will drop in favor of
00177       // joining those lines together.  we iterate here since we might have
00178       // seen a DOS CR taking up two spaces.
00179       for (int k = original_j; k < j; k++) to_strip[k] = ' ';
00180     }
00181   }
00182 
00183 }
00184 
00185 // by CAK.
00186 void string_manipulation::split_lines(const istring &input_in, istring &output,
00187     int min_column, int max_column)
00188 {
00189   output = "";
00190   if (max_column - min_column + 1 < 2) return;  // what's the point?
00191 
00192   istring input = input_in;  // make a copy to work on.
00193   carriage_returns_to_spaces(input);
00194 
00195   int col = min_column;
00196   istring indent_add = indentation(min_column);
00197   output = indent_add;  // start with the extra space.
00198 
00199   bool just_had_break = false;
00200     // set true if we just handled a line break in the previous loop.
00201   bool put_accum_before_break = false;  // true if we must postpone CR.
00202   istring accumulated;
00203     // holds stuff to print on next go-round.
00204 
00205   // now we parse across the list counting up our line size and making sure
00206   // we don't go over it.
00207   for (int j = 0; j < input.length(); j++) {
00208 
00209 //char to_print = input[j];
00210 //if (parser_bits::is_eol(to_print)) to_print = '_';
00211 //printf("[%d: val=%d, '%c', col=%d]\n", j, to_print, to_print, col);
00212 //fflush(0);
00213 
00214     // handle the carriage return if it was ordered.
00215     if (just_had_break) {
00216       if (put_accum_before_break) {
00217         output += accumulated;
00218         // strip off any spaces from the end of the line.
00219         output.strip_spaces(istring::FROM_END);
00220         output += log_base::platform_ending();
00221         accumulated = "";
00222         j++;  // skip the CR that we think is there.
00223       }
00224       // strip off any spaces from the end of the line.
00225       output.strip_spaces(istring::FROM_END);
00226       output += log_base::platform_ending();
00227       col = min_column;
00228       output += indent_add;
00229       just_had_break = false;
00230       if (accumulated.length()) {
00231         output += accumulated;
00232         col += accumulated.length();
00233         accumulated = "";
00234       }
00235       j--;
00236       continue;
00237     }
00238 
00239     put_accum_before_break = false;
00240 
00241     // skip any spaces we've got at the current position.
00242     while ( (input[j] == ' ') || (input[j] == '\t') ) {
00243       j++;
00244       if (j >= input.length()) break;  // break out of subloop if past it.
00245     }
00246 
00247     if (j >= input.length()) break;  // we're past the end.
00248 
00249     // handle carriage returns when they're at the current position.
00250     char current_char = input[j];
00251     if (parser_bits::is_eol(current_char)) {
00252       just_had_break = true;  // set the state.
00253       put_accum_before_break = true;
00254       continue;
00255     }
00256 
00257 //hmmm: the portion below could be called a find word break function.
00258 
00259     bool add_dash = false;  // true if we need to break a word and add hyphen.
00260     bool break_line = false;  // true if we need to go to the next line.
00261     bool invisible = false;  // true if invisible characters were seen.
00262     bool end_sentence = false;  // true if there was a sentence terminator.
00263     bool punctuate = false;  // true if there was normal punctuation.
00264     bool keep_on_line = false;  // true if we want add current then break line.
00265     char prior_break = '\0';  // set for real below.
00266     char prior_break_plus_1 = '\0';  // ditto.
00267 
00268     // find where our next normal word break is, if possible.
00269     int next_break = input.find_any(splitter_finding_set(), j);
00270     // if we didn't find a separator, just use the end of the string.
00271     if (negative(next_break))
00272       next_break = input.length() - 1;
00273 
00274     // now we know where we're supposed to break, but we don't know if it
00275     // will all fit.
00276     prior_break = input[next_break];
00277       // hang onto the value before we change next_break.
00278     prior_break_plus_1 = input[next_break + 1];
00279       // should still be safe since we're stopping before the last zero.
00280     switch (prior_break) {
00281       case '\r': case '\n':
00282         break_line = true;
00283         just_had_break = true;
00284         put_accum_before_break = true;
00285         // intentional fall-through.
00286       case '\t': case ' ':
00287         invisible = true;
00288         next_break--;  // don't include it in what's printed.
00289         break;
00290       case '?': case '!': case '.':
00291         end_sentence = true;
00292         // if we see multiples of these, we count them as just one.
00293         while ( (input[next_break + 1] == '?')
00294             || (input[next_break + 1] == '!')
00295             || (input[next_break + 1] == '.') ) {
00296           next_break++;
00297         }
00298         // make sure that there's a blank area after the supposed punctuation.
00299         if (!parser_bits::white_space(input[next_break + 1]))
00300           end_sentence = false;
00301         break;
00302       case ',': case ';': case ':':
00303         punctuate = true;
00304         // make sure that there's a blank area after the supposed punctuation.
00305         if (!parser_bits::white_space(input[next_break + 1]))
00306           punctuate = false;
00307         break;
00308     }
00309 
00310     // we'll need to add some spaces for certain punctuation.
00311     int punct_adder = 0;
00312     if (punctuate || invisible) punct_adder = 1;
00313     if (end_sentence) punct_adder = 2;
00314 
00315     // check that we're still in bounds.
00316     int chars_added = next_break - j + 1;
00317     if (col + chars_added + punct_adder > max_column + 1) {
00318       // we need to break before the next breakable character.
00319       break_line = true;
00320       just_had_break = true;
00321       if (col + chars_added <= max_column + 1) {
00322         // it will fit without the punctuation spaces, which is fine since
00323         // it should be the end of the line.
00324         invisible = false;
00325         punctuate = false;
00326         end_sentence = false;
00327         punct_adder = 0;
00328         keep_on_line = true;
00329       } else if (min_column + chars_added > max_column + 1) {
00330         // this word won't ever fit unless we break it.
00331         int chars_left = max_column - col + 1;
00332           // remember to take out room for the dash also.
00333         if (chars_left < 2) {
00334           j--;  // stay where we are.
00335           continue;
00336         } else {
00337           next_break = j + chars_left - 2;
00338           chars_added = next_break - j + 1;
00339           if (next_break >= input.length())
00340             next_break = input.length() - 1;
00341           else if (next_break < j)
00342             next_break = j;
00343           add_dash = true;
00344         }
00345       }
00346     }
00347 
00348     istring adding_chunk = input.substring(j, next_break);
00349       // this is what we've decided the next word chunk to be added will be.
00350       // we still haven't completely decided where it goes.
00351 
00352     if (break_line) {
00353       col = min_column;
00354       if (add_dash || keep_on_line) {
00355         // include the previous stuff on the same line.
00356         output += adding_chunk;
00357         if (add_dash) output += "-";
00358         j = next_break;
00359         continue;  // done with this case.
00360       }
00361 
00362       // don't include the previous stuff; make it go to the next line.
00363       accumulated = adding_chunk;
00364       if (punctuate || invisible) {
00365         accumulated += " ";
00366       } else if (end_sentence) {
00367         accumulated += "  ";
00368       }
00369       j = next_break;
00370       continue;
00371     }
00372 
00373     // add the line normally since it should fit.
00374     output += adding_chunk;
00375     col += chars_added + punct_adder;  // add the characters added.
00376     j = next_break;
00377     just_had_break = false;  // reset the state.
00378 
00379     // handle when we processed an invisible or punctuation character.
00380     if (punctuate || invisible) {
00381       output += " ";
00382     } else if (end_sentence) {
00383       output += "  ";
00384     }
00385   }
00386   // make sure we handle any leftovers.
00387   if (accumulated.length()) {
00388     output.strip_spaces(istring::FROM_END);
00389     output += log_base::platform_ending();
00390     output += indent_add;
00391     output += accumulated;
00392   }
00393   output.strip_spaces(istring::FROM_END);
00394   output += log_base::platform_ending();
00395 }
00396 
00397 char string_manipulation::hex_to_char(byte to_convert)
00398 {
00399   if (to_convert <= 9) return char('0' + to_convert);
00400   else if ( (to_convert >= 10) && (to_convert <= 15) )
00401     return char('A' - 10 + to_convert);
00402   else return '?';
00403 }
00404 
00405 byte string_manipulation::char_to_hex(char to_convert)
00406 {
00407   if ( (to_convert >= '0') && (to_convert <= '9') )
00408     return char(to_convert - '0');
00409   else if ( (to_convert >= 'a') && (to_convert <= 'f') )
00410     return char(to_convert - 'a' + 10);
00411   else if ( (to_convert >= 'A') && (to_convert <= 'F') )
00412     return char(to_convert - 'A' + 10);
00413   else return 0;
00414 }
00415 
00416 byte_array string_manipulation::string_to_hex(const istring &to_convert)
00417 {
00418   byte_array to_return(0, NIL);
00419   for (int i = 0; i < to_convert.length() / 2; i++) {
00420     int str_index = i * 2;
00421     byte first_byte = char_to_hex(to_convert.get(str_index));
00422     byte second_byte = char_to_hex(to_convert.get(str_index + 1));
00423     byte to_stuff = byte(first_byte * 16 + second_byte);
00424     to_return.concatenate(to_stuff);
00425   }
00426   return to_return;
00427 }
00428 
00429 istring string_manipulation::hex_to_string(const byte_array &to_convert)
00430 {
00431   istring to_return;
00432   for (int i = 0; i < to_convert.length() * 2; i += 2) {
00433     int str_index = i / 2;
00434     char first_char = hex_to_char(char(to_convert.get(str_index) / 16));
00435     char second_char = hex_to_char(char(to_convert.get(str_index) % 16));
00436     to_return += istring(first_char, 1);
00437     to_return += istring(second_char, 1);
00438   }
00439   return to_return;
00440 }
00441 
00442 
00443 #endif //STRING_MANIPULATION_IMPLEMENTATION_FILE
00444 

Generated on Fri Sep 5 04:29:00 2008 for HOOPLE Libraries by  doxygen 1.5.1